In [None]:
#CNN Function with only one convolution layer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, TimeDistributed, Dense, Input
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_and_evaluate_cnn_1layer(
    train_file_path,
    test_file_path,
    report_file_path,
    predictions_file_path,
    epochs,
    batch_size,
    filter1,
    kernelsize,
    filter2,
    dropout1,
    dropout2,
    dist_lay,
    dist_lay2
):
    """Train and evaluate a CNN with a single convolution layer.

    The model predicts a 3-state secondary-structure label (H, E or C) for
    every residue of a protein sequence. Sequences are one-hot encoded over
    the 20 standard amino acids and post-padded to the longest sequence found
    in the training or test file.

    Padded positions are excluded everywhere: they get zero weight in the
    loss and in the accuracy tracked during training, and they are left out
    of the test accuracy, the classification report and the predictions file.

    Args:
        train_file_path: CSV file with a 'seq' column (amino-acid string) and
            an 'sst3' column (string of H/E/C characters). Each 'sst3' string
            is assumed to be as long as its 'seq' string.
        test_file_path: CSV file with the same columns, used for evaluation.
        report_file_path: Text file that receives the test accuracy and the
            classification report.
        predictions_file_path: CSV file that receives one row per real test
            residue with columns 'True_Label' and 'Predicted_Label'
            (0 = H, 1 = E, 2 = C).
        epochs: Number of training epochs.
        batch_size: Training batch size.
        filter1: Number of filters of the convolution layer.
        kernelsize: Kernel size of the convolution layer.
        filter2: Accepted but not used by this one-layer model.
        dropout1: Dropout rate applied after the convolution layer.
        dropout2: Accepted but not used by this one-layer model.
        dist_lay: Units of the hidden per-residue dense layer.
        dist_lay2: Units of the softmax output layer; has to be 3 because the
            labels are one-hot encoded over three classes.

    Returns:
        None. Results are written to report_file_path and
        predictions_file_path.
    """
    # Load the datasets
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Amino-acid alphabet for the one-hot encoding (dict gives direct lookups)
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_to_column = {aa: column for column, aa in enumerate(amino_acids)}

    def one_hot_encode(seq):
        # Residues outside the alphabet keep an all-zero row.
        encoding = np.zeros((len(seq), len(amino_acids)), dtype=int)
        for i, aa in enumerate(seq):
            column = aa_to_column.get(aa)
            if column is not None:
                encoding[i, column] = 1
        return encoding

    # Encode the sequences for training and testing
    train_encoded = [one_hot_encode(seq) for seq in train_data['seq']]
    test_encoded = [one_hot_encode(seq) for seq in test_data['seq']]

    # Find the maximum sequence length in the training and testing datasets
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    # Pad the sequences to the maximum length
    train_sequences = pad_sequences(train_encoded, maxlen=max_seq_len, padding='post', dtype='float32')
    test_sequences = pad_sequences(test_encoded, maxlen=max_seq_len, padding='post', dtype='float32')

    # Encode the secondary structures as target labels
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}

    def encode_labels(structures):
        """Return (padded integer labels, real-residue mask, one-hot labels)."""
        labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in structures],
                               maxlen=max_seq_len, padding='post', value=-1)
        mask = labels != -1
        # to_categorical would turn the -1 padding marker into class 2 ('C')
        # through negative indexing, so encode a placeholder 0 instead and
        # blank the padded rows afterwards.
        categorical = to_categorical(np.where(mask, labels, 0), num_classes=3)
        return labels, mask, categorical * mask[..., np.newaxis]

    _, train_mask, train_labels_categorical = encode_labels(train_data['sst3'])
    test_labels, test_mask, _ = encode_labels(test_data['sst3'])

    # Define the CNN architecture
    model = Sequential([
        Input(shape=(train_sequences.shape[1], train_sequences.shape[2])),
        Conv1D(filters=filter1, kernel_size=kernelsize, activation='relu', padding='same'),
        Dropout(dropout1),
        TimeDistributed(Dense(dist_lay, activation='relu')),
        TimeDistributed(Dense(dist_lay2, activation='softmax')),
    ])

    # Compile the model; weighted_metrics makes the tracked accuracy honour
    # the per-residue sample weights, so padding does not inflate it.
    model.compile(optimizer='adam', loss='categorical_crossentropy', weighted_metrics=['accuracy'])

    # Train the model; the (samples, timesteps) weights switch padding off
    model.fit(
        train_sequences,
        train_labels_categorical,
        sample_weight=train_mask.astype('float32'),
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        verbose=1
    )

    # Make predictions on the test set
    test_predictions = model.predict(test_sequences)
    test_predictions_labels = np.argmax(test_predictions, axis=-1)

    # Keep real residues only; boolean indexing also flattens to 1-D
    test_predictions_flat = test_predictions_labels[test_mask]
    test_true_labels_flat = test_labels[test_mask]

    test_accuracy = float(np.mean(test_predictions_flat == test_true_labels_flat))

    # Generate the classification report; labels= keeps the class names
    # aligned even if a class is missing from the test data.
    report = classification_report(test_true_labels_flat, test_predictions_flat,
                                   labels=[0, 1, 2], target_names=['H', 'E', 'C'])

    # Save the classification report to a text file
    with open(report_file_path, 'w') as f:
        f.write(f"CNN Test Accuracy: {test_accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    # Save the predictions along with the true labels to a CSV file
    results_df = pd.DataFrame({
        'True_Label': test_true_labels_flat,
        'Predicted_Label': test_predictions_flat
    })
    results_df.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

In [None]:
# Baseline run: one-layer CNN on the first dataset with the initial parameters.
train_and_evaluate_cnn_1layer(
    # Input data
    train_file_path='/content/training_data_clean.csv',
    test_file_path='/content/test_data_clean.csv',
    # Outputs: text report (e.g. test accuracy, F1 scores) and predicted labels
    report_file_path='CNN_initial_1layer_report.txt',
    predictions_file_path='CNN_initial_1layer_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture (filter2 and dropout2 are required by the signature but
    # are not used by the one-layer model)
    filter1=32,
    kernelsize=3,
    filter2=64,
    dropout1=0.2,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)

Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 73ms/step - accuracy: 0.6594 - loss: 0.9134 - val_accuracy: 0.7473 - val_loss: 0.5919
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.7444 - loss: 0.5814 - val_accuracy: 0.7603 - val_loss: 0.5469
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 35ms/step - accuracy: 0.7526 - loss: 0.5535 - val_accuracy: 0.7615 - val_loss: 0.5340
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.7613 - loss: 0.5369 - val_accuracy: 0.7664 - val_loss: 0.5255
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - accuracy: 0.7683 - loss: 0.5261 - val_accuracy: 0.7680 - val_loss: 0.5200
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 44ms/step - accuracy: 0.7732 - loss: 0.5128 - val_accuracy: 0.7729 - val_loss: 0.5137
Epoch 7/10
[1m113/11

In [None]:
# Baseline run: one-layer CNN on the second dataset with the initial parameters.
train_and_evaluate_cnn_1layer(
    # Input data
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    # Outputs: text report (e.g. test accuracy, F1 scores) and predicted labels
    report_file_path='CNN_2_1layer_report.txt',
    predictions_file_path='CNN_2_1layer_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture (filter2 and dropout2 are required by the signature but
    # are not used by the one-layer model)
    filter1=32,
    kernelsize=3,
    filter2=64,
    dropout1=0.2,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.6801 - loss: 0.8227 - val_accuracy: 0.7724 - val_loss: 0.5282
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 35ms/step - accuracy: 0.7569 - loss: 0.5460 - val_accuracy: 0.7811 - val_loss: 0.5055
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - accuracy: 0.7666 - loss: 0.5232 - val_accuracy: 0.7860 - val_loss: 0.4930
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 35ms/step - accuracy: 0.7720 - loss: 0.5114 - val_accuracy: 0.7887 - val_loss: 0.4854
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.7738 - loss: 0.5076 - val_accuracy: 0.7925 - val_loss: 0.4771
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.7753 - loss: 0.5012 - val_accuracy: 0.7962 - val_loss: 0.4730
Epoch 7/10
[1m225/2

In [None]:
#CNN function with 2 convolution layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, TimeDistributed, Dense, Input
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_and_evaluate_cnn(
    train_file_path,
    test_file_path,
    report_file_path,
    predictions_file_path,
    epochs,
    batch_size,
    filter1,
    kernelsize,
    filter2,
    dropout1,
    dropout2,
    dist_lay,
    dist_lay2
):
    """Train and evaluate a CNN with two stacked convolution layers.

    The model predicts a 3-state secondary-structure label (H, E or C) for
    every residue of a protein sequence. Sequences are one-hot encoded over
    the 20 standard amino acids and post-padded to the longest sequence found
    in the training or test file.

    Padded positions are excluded everywhere: they get zero weight in the
    loss and in the accuracy tracked during training, and they are left out
    of the test accuracy, the classification report and the predictions file.

    Args:
        train_file_path: CSV file with a 'seq' column (amino-acid string) and
            an 'sst3' column (string of H/E/C characters). Each 'sst3' string
            is assumed to be as long as its 'seq' string.
        test_file_path: CSV file with the same columns, used for evaluation.
        report_file_path: Text file that receives the test accuracy and the
            classification report.
        predictions_file_path: CSV file that receives one row per real test
            residue with columns 'True_Label' and 'Predicted_Label'
            (0 = H, 1 = E, 2 = C).
        epochs: Number of training epochs.
        batch_size: Training batch size.
        filter1: Number of filters of the first convolution layer.
        kernelsize: Kernel size shared by both convolution layers.
        filter2: Number of filters of the second convolution layer.
        dropout1: Dropout rate applied after the first convolution layer.
        dropout2: Dropout rate applied after the second convolution layer.
        dist_lay: Units of the hidden per-residue dense layer.
        dist_lay2: Units of the softmax output layer; has to be 3 because the
            labels are one-hot encoded over three classes.

    Returns:
        None. Results are written to report_file_path and
        predictions_file_path.
    """
    # Load the datasets
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Amino-acid alphabet for the one-hot encoding (dict gives direct lookups)
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_to_column = {aa: column for column, aa in enumerate(amino_acids)}

    def one_hot_encode(seq):
        # Residues outside the alphabet keep an all-zero row.
        encoding = np.zeros((len(seq), len(amino_acids)), dtype=int)
        for i, aa in enumerate(seq):
            column = aa_to_column.get(aa)
            if column is not None:
                encoding[i, column] = 1
        return encoding

    # Encode the sequences for training and testing
    train_encoded = [one_hot_encode(seq) for seq in train_data['seq']]
    test_encoded = [one_hot_encode(seq) for seq in test_data['seq']]

    # Find the maximum sequence length in the training and testing datasets
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    # Pad the sequences to the maximum length
    train_sequences = pad_sequences(train_encoded, maxlen=max_seq_len, padding='post', dtype='float32')
    test_sequences = pad_sequences(test_encoded, maxlen=max_seq_len, padding='post', dtype='float32')

    # Encode the secondary structures as target labels
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}

    def encode_labels(structures):
        """Return (padded integer labels, real-residue mask, one-hot labels)."""
        labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in structures],
                               maxlen=max_seq_len, padding='post', value=-1)
        mask = labels != -1
        # to_categorical would turn the -1 padding marker into class 2 ('C')
        # through negative indexing, so encode a placeholder 0 instead and
        # blank the padded rows afterwards.
        categorical = to_categorical(np.where(mask, labels, 0), num_classes=3)
        return labels, mask, categorical * mask[..., np.newaxis]

    _, train_mask, train_labels_categorical = encode_labels(train_data['sst3'])
    test_labels, test_mask, _ = encode_labels(test_data['sst3'])

    # Define the CNN architecture
    model = Sequential([
        Input(shape=(train_sequences.shape[1], train_sequences.shape[2])),
        Conv1D(filters=filter1, kernel_size=kernelsize, activation='relu', padding='same'),
        Dropout(dropout1),
        Conv1D(filters=filter2, kernel_size=kernelsize, activation='relu', padding='same'),
        Dropout(dropout2),
        TimeDistributed(Dense(dist_lay, activation='relu')),
        TimeDistributed(Dense(dist_lay2, activation='softmax')),
    ])

    # Compile the model; weighted_metrics makes the tracked accuracy honour
    # the per-residue sample weights, so padding does not inflate it.
    model.compile(optimizer='adam', loss='categorical_crossentropy', weighted_metrics=['accuracy'])

    # Train the model; the (samples, timesteps) weights switch padding off
    model.fit(
        train_sequences,
        train_labels_categorical,
        sample_weight=train_mask.astype('float32'),
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        verbose=1
    )

    # Make predictions on the test set
    test_predictions = model.predict(test_sequences)
    test_predictions_labels = np.argmax(test_predictions, axis=-1)

    # Keep real residues only; boolean indexing also flattens to 1-D
    test_predictions_flat = test_predictions_labels[test_mask]
    test_true_labels_flat = test_labels[test_mask]

    test_accuracy = float(np.mean(test_predictions_flat == test_true_labels_flat))

    # Generate the classification report; labels= keeps the class names
    # aligned even if a class is missing from the test data.
    report = classification_report(test_true_labels_flat, test_predictions_flat,
                                   labels=[0, 1, 2], target_names=['H', 'E', 'C'])

    # Save the classification report to a text file
    with open(report_file_path, 'w') as f:
        f.write(f"CNN Test Accuracy: {test_accuracy:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    # Save the predictions along with the true labels to a CSV file
    results_df = pd.DataFrame({
        'True_Label': test_true_labels_flat,
        'Predicted_Label': test_predictions_flat
    })
    results_df.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")


In [None]:
# Baseline run: two-layer CNN on the first dataset with the initial parameters.
train_and_evaluate_cnn(
    # Input data
    train_file_path='/content/training_data_clean.csv',
    test_file_path='/content/test_data_clean.csv',
    # Outputs: text report (e.g. test accuracy, F1 scores) and predicted labels
    report_file_path='CNN_initial_report',
    predictions_file_path='CNN_initial_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture
    filter1=32,
    kernelsize=3,
    filter2=64,
    dropout1=0.2,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)

Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 64ms/step - accuracy: 0.6780 - loss: 0.8548 - val_accuracy: 0.7633 - val_loss: 0.5395
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.7768 - loss: 0.5227 - val_accuracy: 0.7757 - val_loss: 0.5009
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.7866 - loss: 0.4968 - val_accuracy: 0.7885 - val_loss: 0.4843
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.7944 - loss: 0.4828 - val_accuracy: 0.7936 - val_loss: 0.4729
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 77ms/step - accuracy: 0.8024 - loss: 0.4665 - val_accuracy: 0.8023 - val_loss: 0.4602
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 89ms/step - accuracy: 0.8040 - loss: 0.4612 - val_accuracy: 0.8073 - val_loss: 0.4506
Epoch 7/10
[1m113/1

In [None]:
# Baseline run: two-layer CNN on the second dataset with the initial parameters.
train_and_evaluate_cnn(
    # Input data
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    # Outputs: text report (e.g. test accuracy, F1 scores) and predicted labels
    report_file_path='CNN_2a_report',
    predictions_file_path='CNN_2a_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture
    filter1=32,
    kernelsize=3,
    filter2=64,
    dropout1=0.2,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 41ms/step - accuracy: 0.7060 - loss: 0.7443 - val_accuracy: 0.7973 - val_loss: 0.4793
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 57ms/step - accuracy: 0.7852 - loss: 0.5001 - val_accuracy: 0.8117 - val_loss: 0.4486
Epoch 3/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 62ms/step - accuracy: 0.7961 - loss: 0.4782 - val_accuracy: 0.8242 - val_loss: 0.4242
Epoch 4/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 40ms/step - accuracy: 0.8042 - loss: 0.4593 - val_accuracy: 0.8320 - val_loss: 0.4113
Epoch 5/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 51ms/step - accuracy: 0.8120 - loss: 0.4436 - val_accuracy: 0.8365 - val_loss: 0.4002
Epoch 6/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 42ms/step - accuracy: 0.8176 - loss: 0.4339 - val_accuracy: 0.8395 - val_loss: 0.3939
Epoch 7/10
[1m22

In [None]:
#Install the tuner needed to optimize parameters
pip install keras-tuner

In [None]:
#The tuning code needed to run
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, TimeDistributed, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

def preprocess_data_for_nn(train_data, test_data):
    """Turn the raw train/test tables into padded arrays for the network.

    Both tables must provide a 'seq' column (amino-acid strings) and an
    'sst3' column (strings over H/E/C). Everything is post-padded to the
    longest sequence found in either table.

    Returns:
        (train_sequences, train_labels_categorical,
         test_sequences, test_labels_categorical) where the sequence arrays
        are float32 one-hot encodings over 20 amino acids and the label
        arrays are one-hot over 3 classes, with all-zero rows at padded
        positions.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    state_to_id = {'H': 0, 'E': 1, 'C': 2}

    def encode_residues(sequence):
        # One row per residue; residues outside the alphabet stay all-zero.
        matrix = np.zeros((len(sequence), len(alphabet)), dtype=int)
        for row, residue in enumerate(sequence):
            column = alphabet.find(residue)
            if column >= 0:
                matrix[row, column] = 1
        return matrix

    train_onehot = [encode_residues(sequence) for sequence in train_data['seq']]
    test_onehot = [encode_residues(sequence) for sequence in test_data['seq']]

    # Common padded length across both tables
    longest = max(max(map(len, train_data['seq'])),
                  max(map(len, test_data['seq'])))

    def pad_features(encoded):
        return pad_sequences(encoded, maxlen=longest, padding='post', dtype='float32')

    def encode_states(column):
        # Map H/E/C to integer ids and pad with the marker -1 ...
        ids = pad_sequences([[state_to_id[state] for state in states] for states in column],
                            maxlen=longest, padding='post', value=-1)
        # ... then one-hot encode and blank every row that carries the marker.
        return np.where(ids[..., None] == -1, 0, to_categorical(ids, num_classes=3))

    train_sequences = pad_features(train_onehot)
    test_sequences = pad_features(test_onehot)
    train_labels_categorical = encode_states(train_data['sst3'])
    test_labels_categorical = encode_states(test_data['sst3'])

    return train_sequences, train_labels_categorical, test_sequences, test_labels_categorical

def build_cnn_model(hp):
    """Build and compile a one-convolution-layer CNN for a tuner trial.

    The input shape is read from the module-level X_train, so that variable
    has to exist before the tuner calls this function (tune_cnn sets it).

    Args:
        hp: KerasTuner hyperparameter container supplying 'filters',
            'kernel_size', 'dropout_rate', 'dense_units' and 'learning_rate'.

    Returns:
        The compiled Sequential model.
    """
    timesteps, channels = X_train.shape[1], X_train.shape[2]

    model = Sequential([
        Input(shape=(timesteps, channels)),
        # padding='same' keeps the output length equal to the input length
        Conv1D(
            filters=hp.Choice('filters', [32, 64, 128]),
            kernel_size=hp.Choice('kernel_size', [3, 5]),
            activation='relu',
            padding='same',
        ),
        Dropout(hp.Choice('dropout_rate', [0.2, 0.3, 0.5])),
        TimeDistributed(Dense(hp.Choice('dense_units', [64, 128]), activation='relu')),
        TimeDistributed(Dense(3, activation='softmax')),
    ])

    optimizer = Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 1e-4]))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def tune_cnn(train_file_path, test_file_path, report_file_path, predictions_file_path):
    """Run a Hyperband search over build_cnn_model and evaluate the best model.

    Args:
        train_file_path: CSV file with 'seq' and 'sst3' columns used for the
            search (10% of it is held out for validation).
        test_file_path: CSV file with the same columns, used to evaluate the
            best model.
        report_file_path: Text file that receives the best hyperparameters
            and the classification report.
        predictions_file_path: CSV file that receives the predicted class
            (0 = H, 1 = E, 2 = C) of every real test residue.

    Returns:
        The best model found by the tuner.

    Note:
        The report and the predictions file only cover real residues. The
        tuner objective ('val_accuracy') is still computed by Keras over
        every time step, because build_cnn_model compiles an unweighted
        accuracy metric.
    """
    # Step 1: Load Data
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Step 2: Preprocess Data for Neural Network
    # X_train has to be module-level because build_cnn_model reads it.
    global X_train, y_train, X_test, y_test
    X_train, y_train, X_test, y_test = preprocess_data_for_nn(train_data, test_data)

    # Step 3: Set Up Tuner
    tuner = kt.Hyperband(build_cnn_model,
                         objective='val_accuracy',
                         max_epochs=10,
                         directory='cnn_tuning',
                         project_name='cnn_hyperparameter_tuning')

    # Step 4: Tuning Search
    tuner.search(X_train, y_train, validation_split=0.1, epochs=10,
                 callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

    # Step 5: Evaluate the Best Model
    best_model = tuner.get_best_models(num_models=1)[0]
    y_pred = np.argmax(best_model.predict(X_test), axis=-1)
    y_true = np.argmax(y_test, axis=-1)

    # Padded positions carry an all-zero label row, for which argmax would
    # report class 0 ('H'). Keep real residues only; boolean indexing also
    # flattens the arrays to 1-D.
    real_positions = y_test.sum(axis=-1) > 0
    y_pred_flat = y_pred[real_positions]
    y_true_flat = y_true[real_positions]

    # Step 6: Save Report and Predictions
    with open(report_file_path, 'w') as f:
        f.write("Best Hyperparameters:\n")
        f.write(str(tuner.get_best_hyperparameters()[0].values))
        f.write("\nClassification Report:\n")
        # labels= keeps the class names aligned even if a class is absent
        f.write(classification_report(y_true_flat, y_pred_flat,
                                      labels=[0, 1, 2], target_names=['H', 'E', 'C']))

    pd.DataFrame(y_pred_flat, columns=['Predictions']).to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

    return best_model

In [None]:
# Hyperparameter search for the CNN on dataset 2.
# Inputs: training and test data of dataset 2.
# Outputs: a tuning report with the best parameter values and the labels
# predicted by the best model.
best_cnn = tune_cnn(
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    report_file_path='CNN_tuning_report.txt',
    predictions_file_path='CNN_pred_tuning.csv',
)

In [None]:
# Evaluate the two-layer network on the second dataset with the fine-tuned
# parameters. Remember to run the train_and_evaluate_cnn cell first.
train_and_evaluate_cnn(
    # Input data
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    # Outputs: own file names, so that this run does not overwrite the
    # baseline results stored in CNN_2a_report / CNN_2a_predictions.csv
    report_file_path='CNN_2a_tuned_report.txt',
    predictions_file_path='CNN_2a_tuned_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture (fine-tuned values)
    filter1=128,
    kernelsize=5,
    filter2=128,
    dropout1=0.3,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)

In [None]:
# Evaluate the two-layer network on the third dataset with the fine-tuned parameters.
train_and_evaluate_cnn(
    # Input data
    train_file_path='/content/training_data__part3_clean.csv',
    test_file_path='/content/test_data_part3_clean.csv',
    # Outputs: text report and predicted labels
    report_file_path='CNN_opt_done_part3_report.txt',
    predictions_file_path='CNN_opt_done_part3_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture (fine-tuned values)
    filter1=128,
    kernelsize=5,
    filter2=128,
    dropout1=0.3,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)

In [None]:
# Evaluate the two-layer network on the fourth dataset with the fine-tuned parameters.
train_and_evaluate_cnn(
    # Input data. The test set has to be the held-out file; pointing it at
    # the training file would report accuracy on data the model was fitted to.
    train_file_path='/content/training_data_part4.csv',
    test_file_path='/content/test_data_part4.csv',
    # Outputs: text report and predicted labels
    report_file_path='CNN_part4_report.txt',
    predictions_file_path='CNN__part4_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture (fine-tuned values)
    filter1=128,
    kernelsize=5,
    filter2=128,
    dropout1=0.3,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)

In [None]:
#Broad tuning function to do further optimization
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam

def broad_optimization(train_file_path, test_file_path, report_file_path):
    """Run a coarse Hyperband search over a one-convolution-layer CNN.

    Args:
        train_file_path: CSV file with 'seq' and 'sst3' columns; 10% of it is
            held out for validation during the search.
        test_file_path: CSV file with the same columns. It is only used to
            determine the common padded length during preprocessing.
        report_file_path: Text file that receives the best hyperparameters,
            one "name: value" line each.

    Returns:
        The best HyperParameters object found by the tuner.
    """
    # Read both tables and convert them to padded arrays; the test arrays
    # are not needed for the search itself.
    features, targets, _test_features, _test_targets = preprocess_data_for_nn(
        pd.read_csv(train_file_path),
        pd.read_csv(test_file_path),
    )

    # (timesteps, channels) of one encoded sequence
    sequence_shape = (features.shape[1], features.shape[2])

    def build_model(hp):
        """Build one candidate model from the broad search space."""
        candidate = Sequential([
            Input(shape=sequence_shape),
            Conv1D(
                filters=hp.Choice('filters', [32, 64, 128, 256]),
                kernel_size=hp.Choice('kernel_size', [3, 5, 7]),
                activation='relu',
                padding='same',
            ),
            Dropout(hp.Choice('dropout_rate', [0.2, 0.3, 0.5, 0.6])),
            TimeDistributed(Dense(hp.Choice('dense_units', [64, 128, 256]), activation='relu')),
            TimeDistributed(Dense(3, activation='softmax')),
        ])

        # The learning rate is part of the search space as well
        optimizer = Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4]))
        candidate.compile(optimizer=optimizer,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
        return candidate

    # Coarse search with Hyperband
    tuner = kt.Hyperband(
        build_model,
        objective='val_accuracy',
        max_epochs=20,
        directory='broad_tuning',
        project_name='cnn_broad_optimization'
    )
    tuner.search(features, targets, validation_split=0.1, epochs=10, batch_size=32)

    # Best configuration of the search
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

    # Persist it as plain text
    with open(report_file_path, 'w') as report:
        report.write("Best Hyperparameters from Broad Search:\n")
        report.writelines(f"{param}: {value}\n" for param, value in best_hps.values.items())

    print(f"Best hyperparameters from broad search saved to {report_file_path}")

    return best_hps


In [None]:
# Coarse hyperparameter search on dataset 4; the best values are also written
# to the report file.
broad_hps = broad_optimization(
    report_file_path='broad_optimization_report_CNN_part4.txt',
    train_file_path='/content/training_data_part4.csv',
    test_file_path='/content/test_data_part4.csv',
)


In [None]:
#Fine tuning function based on the broad tuning
def fine_tuning(train_file_path, test_file_path, broad_hps, report_file_path):
    """Refine the broad-search result with Bayesian optimization.

    The search space is a narrow window around the values in broad_hps.

    Args:
        train_file_path: CSV file with 'seq' and 'sst3' columns; 10% of it is
            held out for validation during the search.
        test_file_path: CSV file with the same columns. It is only used to
            determine the common padded length during preprocessing.
        broad_hps: Mapping with the keys 'filters', 'dropout_rate',
            'dense_units' and 'learning_rate' (best values of the broad
            search). Only item access (broad_hps[key]) is used.
        report_file_path: Text file that receives the best hyperparameters,
            one "name: value" line each.

    Returns:
        The best HyperParameters object found by the tuner.
    """
    def clamp_window(low, high, floor, cap):
        """Clamp [low, high] to [floor, cap].

        If clamping would leave an empty or inverted range, the unclamped
        window is returned instead so the tuner always gets low < high.
        """
        clamped_low, clamped_high = max(floor, low), min(high, cap)
        if clamped_low < clamped_high:
            return clamped_low, clamped_high
        return low, high

    # Integer windows: always span at least 96..160 and reach 32 units past
    # the broad optimum. The lower bound is kept at >= 32 (one step) because
    # a layer with 0 filters/units is invalid (broad optimum 32 gave 0 before).
    filters_low = max(32, min(broad_hps['filters'] - 32, 96))
    filters_high = max(broad_hps['filters'] + 32, 160)
    dense_low = max(32, min(broad_hps['dense_units'] - 32, 96))
    dense_high = max(broad_hps['dense_units'] + 32, 160)

    # Dropout: +/-0.1 around the broad optimum, limited to 0.2..0.4 when that
    # leaves a non-empty range (broad optima of 0.5 or 0.6 did not).
    dropout_low, dropout_high = clamp_window(
        max(0.0, broad_hps['dropout_rate'] - 0.1), broad_hps['dropout_rate'] + 0.1, 0.2, 0.4)

    # Learning rate: half the broad optimum up to the optimum itself, limited
    # to 1e-3..1e-2 when that leaves a non-empty range (1e-3 or 1e-4 did not).
    lr_low, lr_high = clamp_window(
        broad_hps['learning_rate'] / 2, broad_hps['learning_rate'], 1e-3, 1e-2)

    # Load the datasets; the test arrays are not needed for the search itself
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)
    X_train, y_train, _, _ = preprocess_data_for_nn(train_data, test_data)
    input_shape = (X_train.shape[1], X_train.shape[2])

    # Fine-tuning within narrower ranges based on broad search results
    def build_fine_tune_model(hp):
        model = Sequential()
        model.add(Input(shape=input_shape))
        model.add(Conv1D(filters=hp.Int('filters', filters_low, filters_high, step=32),
                         kernel_size=hp.Choice('kernel_size', [5, 7, 9]),
                         activation='relu',
                         padding='same'))
        model.add(Dropout(hp.Float('dropout_rate', dropout_low, dropout_high, step=0.05)))
        model.add(TimeDistributed(Dense(hp.Int('dense_units', dense_low, dense_high, step=32),
                                        activation='relu')))
        model.add(TimeDistributed(Dense(3, activation='softmax')))

        # Compile with refined learning rate
        model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', lr_low, lr_high)),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    # Fine-tuning with Bayesian Optimization
    tuner = kt.BayesianOptimization(
        build_fine_tune_model,
        objective='val_accuracy',
        max_trials=20,
        directory='fine_tuning',
        project_name='cnn_fine_tuning'
    )

    # Perform fine-tuning search
    tuner.search(X_train, y_train, validation_split=0.1, epochs=20, batch_size=32)

    # Retrieve best hyperparameters
    best_fine_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

    # Save best hyperparameters from fine-tuning to a report file
    with open(report_file_path, 'w') as f:
        f.write("Best Hyperparameters from Fine-Tuning:\n")
        for param, value in best_fine_hps.values.items():
            f.write(f"{param}: {value}\n")

    print(f"Best hyperparameters from fine-tuning saved to {report_file_path}")

    return best_fine_hps

In [None]:
# Fine-tune on dataset 4, starting from the results of the broad tuning.
fine_hps = fine_tuning(
    train_file_path='/content/training_data_part4.csv',
    test_file_path='/content/test_data_part4.csv',
    report_file_path='fine_tuning_report_part4.txt',
    # Broad-tuning results, entered here as literal values
    broad_hps=dict(
        filters=128,
        kernel_size=7,
        dropout_rate=0.3,
        dense_units=128,
        learning_rate=0.01,
    ),
)


In [None]:
# Final run: two-layer CNN on dataset 4 with the fine-tuned parameters.
train_and_evaluate_cnn(
    # Input data
    train_file_path='/content/training_data_part4.csv',
    test_file_path='/content/test_data_part4.csv',
    # Outputs: text report and predicted labels
    report_file_path='final_optimized_cnn_report.txt',
    predictions_file_path='final_optimized_cnn_predictions.csv',
    # Training schedule
    epochs=10,
    batch_size=32,
    # Architecture; the keyword names follow the function signature
    # (kernelsize, dropout1, dropout2)
    filter1=128,
    kernelsize=7,
    filter2=128,
    dropout1=0.3,
    dropout2=0.3,
    dist_lay=128,
    dist_lay2=3,
)
