In [None]:
#Final trained CNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, TimeDistributed, Dense, Input
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_and_evaluate_cnn(
    train_file_path,
    test_file_path,
    report_file_path,
    predictions_file_path,
    filters=128,
    kernel_size=5,
    dropout_rate=0.3,
    dense_units=128,
    learning_rate=0.001,
    epochs=10,
    batch_size=32
):
    """Train a per-residue 1D-CNN (H/E/C) classifier and evaluate it on a test set.

    Both CSV files are read with ``pd.read_csv`` and must provide a ``seq``
    column (amino-acid string) and an ``sst3`` column (string over ``H``,
    ``E``, ``C``). Sequences are one-hot encoded over the 20 letters in
    ``'ACDEFGHIKLMNPQRSTVWY'`` (any other letter becomes an all-zero row) and
    post-padded to the longest sequence found in either file.

    Padded positions are excluded everywhere: their label rows are all zero,
    they get a sample weight of 0 during training, and they are left out of
    the reported accuracy, the classification report and the predictions CSV.

    Parameters
    ----------
    train_file_path, test_file_path : str
        Paths of the training and test CSV files.
    report_file_path : str
        Text file that receives the test accuracy and classification report.
    predictions_file_path : str
        CSV file that receives one row per real (non-padded) test residue
        with columns ``True_Label`` and ``Predicted_Label`` (0=H, 1=E, 2=C).
    filters, kernel_size : int
        Configuration of the ``Conv1D`` layer.
    dropout_rate : float
        Rate of the ``Dropout`` layer that follows the convolution.
    dense_units : int
        Width of the hidden time-distributed ``Dense`` layer.
    learning_rate : float
        Learning rate handed to ``Adam``.
    epochs, batch_size : int
        Passed straight to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, test_sequences, test_labels_categorical)``: the fitted
        model, the padded one-hot test inputs ``(n_test, max_len, 20)`` and
        the one-hot test labels ``(n_test, max_len, 3)`` whose padded
        positions are all-zero rows.

    Raises
    ------
    KeyError
        If an ``sst3`` string contains a symbol other than ``H``, ``E``, ``C``.
    """
    # Load the datasets
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Amino-acid alphabet; a dict lookup replaces the per-residue str.index scan
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_to_column = {aa: col for col, aa in enumerate(amino_acids)}

    # Secondary-structure classes and their integer ids
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}
    class_names = ['H', 'E', 'C']
    class_ids = [0, 1, 2]

    # Every sequence is padded to the longest one across both datasets
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    def one_hot_encode(seq):
        """Return a (len(seq), 20) one-hot matrix; unknown letters stay all-zero."""
        encoding = np.zeros((len(seq), len(aa_to_column)), dtype=int)
        for i, aa in enumerate(seq):
            col = aa_to_column.get(aa)
            if col is not None:
                encoding[i, col] = 1
        return encoding

    def encode_inputs(frame):
        """One-hot encode and post-pad the ``seq`` column of ``frame``."""
        encoded = [one_hot_encode(seq) for seq in frame['seq']]
        return pad_sequences(encoded, maxlen=max_seq_len, padding='post', dtype='float32')

    def encode_labels(frame):
        """Return (one-hot labels, boolean mask of real residues) for ``frame``."""
        label_ids = pad_sequences(
            [[sst3_mapping[ss] for ss in sst] for sst in frame['sst3']],
            maxlen=max_seq_len, padding='post', value=-1)
        residue_mask = label_ids >= 0
        # to_categorical indexes with the raw value, so -1 would silently mark
        # every padded position as the last class ('C'). Encode padding as a
        # valid id first, then blank those rows out.
        one_hot = to_categorical(np.where(residue_mask, label_ids, 0), num_classes=3)
        one_hot[~residue_mask] = 0
        return one_hot, residue_mask

    train_sequences = encode_inputs(train_data)
    test_sequences = encode_inputs(test_data)
    train_labels_categorical, train_mask = encode_labels(train_data)
    test_labels_categorical, test_mask = encode_labels(test_data)

    # Define the CNN architecture based on the provided parameters
    model = Sequential([
        Input(shape=(train_sequences.shape[1], train_sequences.shape[2])),
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', padding='same'),
        Dropout(dropout_rate),
        TimeDistributed(Dense(dense_units, activation='relu')),
        TimeDistributed(Dense(3, activation='softmax'))
    ])

    # weighted_metrics (rather than metrics) so the accuracy shown during
    # training honours the per-residue sample weights and ignores padding.
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  weighted_metrics=['accuracy'])

    # Train the model; the (samples, timesteps) weights zero out padded positions
    model.fit(
        train_sequences,
        train_labels_categorical,
        sample_weight=train_mask.astype('float32'),
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        verbose=1
    )

    # Predict on the test set and keep real residues only; boolean indexing
    # flattens in row-major order, i.e. sequence by sequence.
    test_predictions = model.predict(test_sequences)
    test_predictions_flat = np.argmax(test_predictions, axis=-1)[test_mask]
    test_true_labels_flat = np.argmax(test_labels_categorical, axis=-1)[test_mask]

    # Per-residue accuracy over real residues (padding never contributes)
    if test_true_labels_flat.size:
        test_accuracy = float(np.mean(test_predictions_flat == test_true_labels_flat))
    else:
        test_accuracy = 0.0

    # Explicit labels keep target_names aligned even if a class is missing
    report = classification_report(test_true_labels_flat, test_predictions_flat,
                                   labels=class_ids, target_names=class_names)

    # Save the classification report to a text file
    with open(report_file_path, 'w') as f:
        f.write(f"CNN Test Accuracy: {test_accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    # Save the predictions along with the true labels to a CSV file
    results_df = pd.DataFrame({
        'True_Label': test_true_labels_flat,
        'Predicted_Label': test_predictions_flat
    })
    results_df.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

    # Returned so callers can reuse the fitted model and the encoded test set
    return model, test_sequences, test_labels_categorical

In [None]:
# Test the fine-tuned hyperparameters
# Final run: the four file paths are given positionally in the function's
# parameter order, followed by the hyperparameter values chosen during tuning.
train_and_evaluate_cnn(
    '/content/training_data_part4.csv',       # train_file_path
    '/content/test_data_part4.csv',           # test_file_path
    'final_optimized_cnn_report.txt',         # report_file_path
    'final_optimized_cnn_predictions.csv',    # predictions_file_path
    # --- convolution / dense architecture ---
    filters=128,
    kernel_size=7,
    dense_units=128,
    dropout_rate=0.3,
    # --- optimisation schedule ---
    learning_rate=0.01,
    epochs=20,
    batch_size=32,    # standard batch size; adjust if needed
)

# ROC curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np

def plot_roc_curves(model, X_test, y_test_onehot, class_names=['H', 'E', 'C']):
    """Plot one-vs-rest ROC curves, one per class, for a fitted classifier.

    Works for plain ``(samples, classes)`` targets and for per-residue
    ``(samples, positions, classes)`` targets: every leading axis is flattened
    so each row is a single prediction. Rows whose one-hot label is all zeros
    carry no class and are left out of the curves.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X_test)`` that returns class scores with
        the same shape as ``y_test_onehot``.
    X_test : array-like
        Inputs handed to ``model.predict``.
    y_test_onehot : array-like
        One-hot encoded true labels; the last axis is the class axis.
    class_names : list of str
        Legend names; class ``i`` is read from index ``i`` of the last axis.
    """
    y_true = np.asarray(y_test_onehot)
    y_score = np.asarray(model.predict(X_test))

    # Number of classes
    n_classes = len(class_names)

    # Collapse (samples, positions, classes) to (rows, classes). Taking
    # argmax over axis 1 of a 3-D tensor would reduce over sequence
    # positions instead of classes, so flatten first.
    y_true = y_true.reshape(-1, y_true.shape[-1])
    y_score = y_score.reshape(-1, y_score.shape[-1])

    # Keep labelled rows only (all-zero one-hot rows have no class)
    labelled = y_true.sum(axis=1) > 0
    y_true = y_true[labelled]
    y_score = y_score[labelled]

    # Convert one-hot encoded labels to single integer labels (1D array)
    y_test_int = np.argmax(y_true, axis=1)

    plt.figure(figsize=(10, 8))

    # Compute ROC curve and ROC area for each class
    for i in range(n_classes):
        # Binary labels for the current class (one-vs-rest)
        y_test_bin = (y_test_int == i).astype(int)

        # Column i of the flattened scores is the score for class i
        fpr, tpr, _ = roc_curve(y_test_bin, y_score[:, i])
        roc_auc = auc(fpr, tpr)

        # Plot ROC curve
        plt.plot(fpr, tpr, lw=2, label=f'ROC curve for {class_names[i]} (area = {roc_auc:.2f})')

    # Plot the diagonal line for random chance
    plt.plot([0, 1], [0, 1], 'k--', lw=2, label="Chance")

    # Customize plot appearance
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Each Class')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Train the model and get test data
# The call result is unpacked into three names, so train_and_evaluate_cnn is
# expected to hand back (model, test inputs, one-hot test labels).
model, X_test, y_test_onehot = train_and_evaluate_cnn(
    '/content/training_data_part4.csv',       # train_file_path
    '/content/test_data_part4.csv',           # test_file_path
    'final_optimized_cnn_report.txt',         # report_file_path
    'final_optimized_cnn_predictions.csv',    # predictions_file_path
    # --- convolution / dense architecture ---
    filters=128,
    kernel_size=7,
    dense_units=128,
    dropout_rate=0.3,
    # --- optimisation schedule ---
    learning_rate=0.01,
    epochs=20,
    batch_size=32,
)

# Draw one ROC curve per secondary-structure class
plot_roc_curves(model, X_test, y_test_onehot, ['H', 'E', 'C'])
