#CNN Testing

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, TimeDistributed, Dense, Input
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_and_evaluate_cnn(
    train_file_path,
    test_file_path,
    report_file_path,
    predictions_file_path,
    filters=128,
    kernel_size=5,
    dropout_rate=0.3,
    dense_units=128,
    learning_rate=0.001,
    epochs=10,
    batch_size=32
):
    """Train a per-position 1D CNN on one CSV and evaluate it on another.

    Both CSV files must contain a ``seq`` column (strings over the 20-letter
    amino-acid alphabet; other characters become all-zero rows) and an
    ``sst3`` column (strings over ``H``/``E``/``C``; any other character
    raises ``KeyError``). The code assumes each ``sst3`` string is as long as
    its ``seq`` string; this is not checked.

    Sequences are padded at the end to the longest sequence found in either
    file. Padded positions are excluded from the loss (their target rows are
    all zero) and from every evaluation output, so the accuracy, the
    classification report and the predictions CSV describe real positions
    only.

    Args:
        train_file_path: CSV used for fitting (10% is held out by Keras as
            a validation split).
        test_file_path: CSV used for the final evaluation.
        report_file_path: Text file receiving accuracy and the
            classification report.
        predictions_file_path: CSV receiving one row per real test position
            with ``True_Label`` and ``Predicted_Label`` (0=H, 1=E, 2=C).
        filters: Number of Conv1D filters.
        kernel_size: Conv1D kernel width.
        dropout_rate: Dropout rate applied after the convolution.
        dense_units: Width of the hidden per-position dense layer.
        learning_rate: Adam learning rate.
        epochs: Number of training epochs.
        batch_size: Training batch size.
    """
    # Load the datasets
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Amino-acid alphabet and a constant-time lookup of each letter's column
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_to_column = {aa: col for col, aa in enumerate(amino_acids)}

    def one_hot_encode(seq):
        """Return a (len(seq), 20) 0/1 matrix; unknown letters stay all-zero."""
        encoding = np.zeros((len(seq), len(amino_acids)), dtype=int)
        for row, aa in enumerate(seq):
            col = aa_to_column.get(aa)
            if col is not None:
                encoding[row, col] = 1
        return encoding

    # Encode the sequences for training and testing
    train_encoded = [one_hot_encode(seq) for seq in train_data['seq']]
    test_encoded = [one_hot_encode(seq) for seq in test_data['seq']]

    # A single padding length shared by both datasets keeps the input shape fixed
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    # Pad the sequences to the maximum length
    train_sequences = pad_sequences(train_encoded, maxlen=max_seq_len, padding='post', dtype='float32')
    test_sequences = pad_sequences(test_encoded, maxlen=max_seq_len, padding='post', dtype='float32')

    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}
    pad_label = -1

    def encode_labels(structures):
        """Return (padded class ids, real-position mask, masked one-hot targets)."""
        padded = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in structures],
                               maxlen=max_seq_len, padding='post', value=pad_label)
        is_residue = padded != pad_label
        # to_categorical would turn -1 into the last class ('C'), so padding is
        # mapped to a valid id first and its one-hot row is zeroed afterwards.
        one_hot = to_categorical(np.where(is_residue, padded, 0), num_classes=3)
        return padded, is_residue, one_hot * is_residue[..., None]

    # All-zero target rows add nothing to categorical cross-entropy
    # (-sum(y_true * log(y_pred))), so padding does not influence training.
    _, _, train_labels_categorical = encode_labels(train_data['sst3'])
    test_labels, test_mask, _ = encode_labels(test_data['sst3'])

    # Define the CNN architecture based on the provided parameters
    model = Sequential([
        Input(shape=(train_sequences.shape[1], train_sequences.shape[2])),
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', padding='same'),
        Dropout(dropout_rate),
        TimeDistributed(Dense(dense_units, activation='relu')),
        TimeDistributed(Dense(3, activation='softmax'))
    ])

    # Compile the model with the provided learning rate
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Train the model.
    # NOTE(review): the 'accuracy' metric printed by Keras during training is
    # computed over every position, padding included — treat it as indicative.
    model.fit(
        train_sequences,
        train_labels_categorical,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        verbose=1
    )

    # Make predictions on the test set
    test_predictions = model.predict(test_sequences)
    test_predictions_labels = np.argmax(test_predictions, axis=-1)

    # Boolean indexing flattens and drops padded positions in one step
    test_predictions_flat = test_predictions_labels[test_mask]
    test_true_labels_flat = test_labels[test_mask]

    # Accuracy over real positions only
    if test_true_labels_flat.size:
        test_accuracy = float(np.mean(test_predictions_flat == test_true_labels_flat))
    else:
        test_accuracy = float('nan')

    # Explicit labels keep the report aligned with target_names even when a
    # class is missing from the test set.
    report = classification_report(test_true_labels_flat, test_predictions_flat,
                                   labels=[0, 1, 2], target_names=['H', 'E', 'C'])

    # Save the classification report to a text file
    with open(report_file_path, 'w') as f:
        f.write(f"CNN Test Accuracy: {test_accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    # Save the predictions along with the true labels to a CSV file
    results_df = pd.DataFrame({
        'True_Label': test_true_labels_flat,
        'Predicted_Label': test_predictions_flat
    })
    results_df.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

In [None]:
# Evaluate on the held-out test CSV: scoring the model on the file it was
# trained on would report an optimistic, leaked accuracy.
train_and_evaluate_cnn(
    train_file_path='/content/training_data_part4.csv',
    test_file_path='/content/test_data_part4.csv',
    report_file_path='CNN_part4_report.txt',
    predictions_file_path='CNN__part4_predictions.csv'
)

In [None]:
# Install the keras-tuner package; `keras_tuner` is imported by the tuning cells below.
# NOTE(review): a bare `pip ...` line only works as a notebook cell command — confirm before running this file as a script.
pip install keras-tuner

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

def preprocess_data_for_nn(train_data, test_data):
    """Turn two data frames into padded network inputs and one-hot targets.

    Each frame is read through its ``seq`` column (one-hot encoded over the
    20-letter alphabet; other characters give an all-zero row) and its
    ``sst3`` column (``H``/``E``/``C`` mapped to 0/1/2; anything else raises
    ``KeyError``). Everything is post-padded to the longest ``seq`` found in
    either frame, and padded label positions get an all-zero target row.

    Returns:
        ``(train_sequences, train_labels_categorical,
        test_sequences, test_labels_categorical)``.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    structure_to_class = {'H': 0, 'E': 1, 'C': 2}

    def encode_sequence(seq):
        # One row per character, one column per alphabet letter
        matrix = np.zeros((len(seq), len(alphabet)), dtype=int)
        for row, residue in enumerate(seq):
            col = alphabet.find(residue)
            if col >= 0:
                matrix[row, col] = 1
        return matrix

    # One-hot encode every sequence before measuring the padding length
    train_matrices = list(map(encode_sequence, train_data['seq']))
    test_matrices = list(map(encode_sequence, test_data['seq']))

    # Shared padding length: the longest sequence across both frames
    longest = max(
        max(map(len, train_data['seq'])),
        max(map(len, test_data['seq'])),
    )
    pad_kwargs = dict(maxlen=longest, padding='post')

    train_sequences = pad_sequences(train_matrices, dtype='float32', **pad_kwargs)
    test_sequences = pad_sequences(test_matrices, dtype='float32', **pad_kwargs)

    def masked_one_hot(structures):
        # Class ids per position, padded with -1 as the "no residue" marker
        class_ids = pad_sequences(
            [[structure_to_class[symbol] for symbol in sst] for sst in structures],
            value=-1,
            **pad_kwargs,
        )
        is_padding = (class_ids == -1)[..., None]
        # Wherever the marker sits, replace the one-hot row with zeros
        return np.where(is_padding, 0, to_categorical(class_ids, num_classes=3))

    train_labels_categorical = masked_one_hot(train_data['sst3'])
    test_labels_categorical = masked_one_hot(test_data['sst3'])

    return train_sequences, train_labels_categorical, test_sequences, test_labels_categorical

#Broad optimization

In [None]:
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam

def broad_optimization(train_file_path, test_file_path, report_file_path):
    """Run a coarse Hyperband search over the CNN hyperparameters.

    Reads both CSVs, preprocesses them with ``preprocess_data_for_nn`` (the
    test frame is passed along but only the training arrays are used for
    the search), tunes filters, kernel size, dropout rate, dense units and
    learning rate against ``val_accuracy``, writes the winning values to
    ``report_file_path`` and returns them.

    Returns:
        The best hyperparameter object reported by the tuner.
    """
    train_frame = pd.read_csv(train_file_path)
    test_frame = pd.read_csv(test_file_path)
    X_train, y_train, _, _ = preprocess_data_for_nn(train_frame, test_frame)

    # (positions, features) of a single padded example
    input_shape = (X_train.shape[1], X_train.shape[2])

    def build_model(hp):
        """Build and compile one candidate model from the coarse search space."""
        conv = Conv1D(
            filters=hp.Choice('filters', [32, 64, 128, 256]),
            kernel_size=hp.Choice('kernel_size', [3, 5, 7]),
            activation='relu',
            padding='same',
        )
        drop = Dropout(hp.Choice('dropout_rate', [0.2, 0.3, 0.5, 0.6]))
        hidden = TimeDistributed(
            Dense(hp.Choice('dense_units', [64, 128, 256]), activation='relu')
        )
        output = TimeDistributed(Dense(3, activation='softmax'))

        candidate = Sequential([Input(shape=input_shape), conv, drop, hidden, output])
        candidate.compile(
            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )
        return candidate

    tuner = kt.Hyperband(
        build_model,
        objective='val_accuracy',
        max_epochs=20,
        directory='broad_tuning',
        project_name='cnn_broad_optimization',
    )

    # NOTE(review): the tuner is configured with max_epochs=20 while search()
    # is given epochs=10 — confirm which of the two actually takes effect.
    tuner.search(X_train, y_train, validation_split=0.1, epochs=10, batch_size=32)

    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

    # Persist the winning configuration, one "name: value" line per parameter
    with open(report_file_path, 'w') as report:
        report.write("Best Hyperparameters from Broad Search:\n")
        report.writelines(
            f"{param}: {value}\n" for param, value in best_hps.values.items()
        )

    print(f"Best hyperparameters from broad search saved to {report_file_path}")

    return best_hps

In [None]:
# Run the broad Hyperband search; arguments are, in order, the training CSV,
# the test CSV and the report file.
broad_hps = broad_optimization(
    '/content/training_data_part4.csv',
    '/content/test_data_part4.csv',
    'broad_optimization_report_CNN_part4.txt',
)


#Fine optimization

In [None]:
def _fine_tune_bounds(broad_hps):
    """Return ``{name: (low, high)}`` search windows centred on the broad optimum.

    Every window is valid (``low <= high``, layer sizes at least 32) for any
    value the broad search can return. For the reference point
    (filters=128, dropout_rate=0.3, dense_units=128, learning_rate=0.01) the
    windows are 96-160, 0.2-0.4, 96-160 and 0.005-0.01.
    """
    step = 32
    filters = int(broad_hps['filters'])
    dense_units = int(broad_hps['dense_units'])
    dropout = float(broad_hps['dropout_rate'])
    learning_rate = float(broad_hps['learning_rate'])

    # Dropout window is +/-0.1 around the optimum. The 0.2 floor is kept unless
    # the optimum itself lies below it; the 0.9 cap keeps the rate under 1.
    # Rounding removes float noise such as 0.3 - 0.1 == 0.19999999999999998.
    dropout_low = round(min(max(0.2, dropout - 0.1), dropout), 2)
    dropout_high = round(min(dropout + 0.1, 0.9), 2)

    return {
        # One step either side, but never fewer than `step` units/filters
        'filters': (max(step, filters - step), filters + step),
        'dense_units': (max(step, dense_units - step), dense_units + step),
        'dropout_rate': (dropout_low, dropout_high),
        # Explore from half the broad learning rate up to the broad value
        'learning_rate': (learning_rate / 2, learning_rate),
    }


def fine_tuning(train_file_path, test_file_path, broad_hps, report_file_path):
    """Refine the broad-search result with Bayesian optimisation.

    Args:
        train_file_path: CSV with the training data.
        test_file_path: CSV with the test data; it is only passed to
            ``preprocess_data_for_nn`` and its arrays are not used in the
            search.
        broad_hps: Mapping-like object indexed by ``'filters'``,
            ``'dropout_rate'``, ``'dense_units'`` and ``'learning_rate'``
            holding the broad-search optimum.
        report_file_path: Text file receiving the best fine-tuned values.

    Returns:
        The best hyperparameter object reported by the tuner.
    """
    # Load and preprocess first so input_shape exists before the model
    # builder that closes over it is defined.
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)
    X_train, y_train, _, _ = preprocess_data_for_nn(train_data, test_data)
    input_shape = (X_train.shape[1], X_train.shape[2])

    bounds = _fine_tune_bounds(broad_hps)

    # Fine-tuning within narrower ranges based on broad search results
    def build_fine_tune_model(hp):
        model = Sequential()
        model.add(Input(shape=input_shape))
        model.add(Conv1D(filters=hp.Int('filters', *bounds['filters'], step=32),
                         # Kernel sizes are a fixed candidate set, not derived from broad_hps
                         kernel_size=hp.Choice('kernel_size', [5, 7, 9]),
                         activation='relu',
                         padding='same'))
        model.add(Dropout(hp.Float('dropout_rate', *bounds['dropout_rate'], step=0.05)))
        model.add(TimeDistributed(Dense(hp.Int('dense_units', *bounds['dense_units'], step=32),
                                        activation='relu')))
        model.add(TimeDistributed(Dense(3, activation='softmax')))

        # Compile with refined learning rate
        model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', *bounds['learning_rate'])),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    # Fine-tuning with Bayesian Optimization
    tuner = kt.BayesianOptimization(
        build_fine_tune_model,
        objective='val_accuracy',
        max_trials=20,
        directory='fine_tuning',
        project_name='cnn_fine_tuning'
    )

    # Perform fine-tuning search
    tuner.search(X_train, y_train, validation_split=0.1, epochs=20, batch_size=32)

    # Retrieve best hyperparameters
    best_fine_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

    # Save best hyperparameters from fine-tuning to a report file
    with open(report_file_path, 'w') as f:
        f.write("Best Hyperparameters from Fine-Tuning:\n")
        for param, value in best_fine_hps.values.items():
            f.write(f"{param}: {value}\n")

    print(f"Best hyperparameters from fine-tuning saved to {report_file_path}")

    return best_fine_hps

In [None]:
# The broad-search optimum is supplied here as a literal dict rather than the
# `broad_hps` object returned by the broad-optimization cell.
fine_hps = fine_tuning(
    '/content/training_data_part4.csv',
    '/content/test_data_part4.csv',
    broad_hps=dict(
        filters=128,
        kernel_size=7,
        dropout_rate=0.3,
        dense_units=128,
        learning_rate=0.01,
    ),
    report_file_path='fine_tuning_report_part4.txt',
)