# Random Forest

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def preprocess_data(train_data, test_data):
    """Build per-residue feature and label arrays for a classical classifier.

    Every residue of every sequence becomes one sample: a 20-dimensional
    one-hot vector over the standard amino acids (all zeros for any other
    character) paired with an integer secondary-structure label
    (H = 0, E = 1, C = 2).

    The per-sequence arrays are concatenated directly instead of being padded
    to the longest sequence and masked afterwards. For well-formed input the
    result is the same, but no (n_sequences, max_len, 20) intermediate is
    allocated.

    Args:
        train_data: Mapping/table whose 'seq' entry iterates over amino-acid
            strings and whose 'sst3' entry iterates over label strings.
        test_data: Same layout as ``train_data``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Each ``X`` is a float32
        array of shape (total_residues, 20); each ``y`` is an int32 array of
        shape (total_residues,).

    Raises:
        ValueError: If a sequence and its label string differ in length, which
            would otherwise misalign features and labels.
        KeyError: If a label string contains a character other than H, E or C.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    # Dict lookup replaces repeated linear `str.index` scans per residue.
    aa_index = {aa: col for col, aa in enumerate(amino_acids)}
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}

    def encode_split(data, split_name):
        """Return (features, labels) for one split, one row per residue."""
        feature_blocks = []
        label_blocks = []
        for row, (seq, sst) in enumerate(zip(data['seq'], data['sst3'])):
            if len(seq) != len(sst):
                raise ValueError(
                    f"{split_name} row {row}: sequence length {len(seq)} does not "
                    f"match sst3 length {len(sst)}"
                )
            encoding = np.zeros((len(seq), len(amino_acids)), dtype='float32')
            for pos, aa in enumerate(seq):
                col = aa_index.get(aa)
                if col is not None:  # unknown residues stay all-zero
                    encoding[pos, col] = 1
            feature_blocks.append(encoding)
            label_blocks.append(
                np.array([sst3_mapping[ss] for ss in sst], dtype='int32')
            )

        if not feature_blocks:
            # np.concatenate rejects an empty list; return correctly shaped empties.
            return (np.zeros((0, len(amino_acids)), dtype='float32'),
                    np.zeros((0,), dtype='int32'))
        return np.concatenate(feature_blocks), np.concatenate(label_blocks)

    X_train, y_train = encode_split(train_data, 'train')
    X_test, y_test = encode_split(test_data, 'test')

    return X_train, y_train, X_test, y_test

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

def tune_random_forest(train_file_path, test_file_path, report_file_path, predictions_file_path):
    """Grid-search a random forest on per-residue features and save the results.

    Reads the train/test CSVs, converts them with ``preprocess_data``, runs a
    5-fold ``GridSearchCV`` over forest size and tree-shape parameters on all
    available cores, then scores the best estimator on the test split.

    Args:
        train_file_path: CSV path of the training split.
        test_file_path: CSV path of the test split.
        report_file_path: Text file that receives the best parameters, test
            accuracy and classification report.
        predictions_file_path: CSV file that receives the test predictions in
            a single 'Predictions' column.

    Returns:
        The best estimator found by the grid search.
    """
    # Load both splits and turn them into per-residue arrays.
    train_df = pd.read_csv(train_file_path)
    test_df = pd.read_csv(test_file_path)
    X_train, y_train, X_test, y_test = preprocess_data(train_df, test_df)

    # Exhaustive search space: 3 * 4 * 3 * 3 parameter combinations.
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    searcher = GridSearchCV(RandomForestClassifier(), search_space, cv=5, n_jobs=-1, verbose=2)
    searcher.fit(X_train, y_train)

    # Score the refit winner on the held-out test residues.
    best_model = searcher.best_estimator_
    predictions = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    report_text = classification_report(y_test, predictions)

    # Assemble the whole report first, then write it in one call.
    report_lines = ["Random Forest Best Parameters:\n"]
    report_lines.extend(
        f"{param}: {value}\n" for param, value in searcher.best_params_.items()
    )
    report_lines.append(f"\nTest Accuracy: {test_accuracy:.4f}\n")
    report_lines.append("\nClassification Report:\n")
    report_lines.append(report_text)
    with open(report_file_path, 'w') as report_file:
        report_file.writelines(report_lines)

    predictions_frame = pd.DataFrame(predictions, columns=['Predictions'])
    predictions_frame.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

    return best_model


In [None]:
# Random Forest: tune on the part-2 train/test CSVs and keep the best estimator.
best_rf = tune_random_forest(
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    report_file_path='RF_tuning_report.txt',
    predictions_file_path='RF_pred_tuning.csv',
)

# CNN

In [None]:
pip install keras-tuner

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

def preprocess_data_for_nn(train_data, test_data):
    """Build padded per-sequence tensors for the sequence-labelling networks.

    Each sequence is one-hot encoded over the 20 standard amino acids (any
    other character yields an all-zero row) and each label string is one-hot
    encoded over H/E/C. Both splits are post-padded with zeros to the longest
    sequence found in either split, so padded positions have all-zero feature
    and label vectors.

    The arrays are filled directly with NumPy rather than by one-hot encoding
    a ``-1`` padding sentinel and zeroing it afterwards, and sequence/label
    length mismatches are rejected instead of being silently padded or
    truncated out of alignment.

    NOTE(review): a second definition of this function appears in a later
    cell and replaces this one when the cells run in order; consider keeping
    a single copy.

    Args:
        train_data: Mapping/table whose 'seq' entry iterates over amino-acid
            strings and whose 'sst3' entry iterates over label strings.
        test_data: Same layout as ``train_data``.

    Returns:
        Tuple ``(train_sequences, train_labels, test_sequences, test_labels)``
        of float32 arrays. Sequences have shape (n, max_len, 20) and labels
        have shape (n, max_len, 3).

    Raises:
        ValueError: If a sequence and its label string differ in length.
        KeyError: If a label string contains a character other than H, E or C.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_index = {aa: col for col, aa in enumerate(amino_acids)}
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}

    def paired_rows(data, split_name):
        """Return [(seq, sst), ...] after checking that lengths agree."""
        pairs = list(zip(data['seq'], data['sst3']))
        for row, (seq, sst) in enumerate(pairs):
            if len(seq) != len(sst):
                raise ValueError(
                    f"{split_name} row {row}: sequence length {len(seq)} does not "
                    f"match sst3 length {len(sst)}"
                )
        return pairs

    train_pairs = paired_rows(train_data, 'train')
    test_pairs = paired_rows(test_data, 'test')

    # Both splits share one padded length so a single model fits either.
    max_seq_len = max((len(seq) for seq, _ in train_pairs + test_pairs), default=0)

    def encode(pairs):
        """Fill zero-initialised tensors; untouched cells are the padding."""
        sequences = np.zeros((len(pairs), max_seq_len, len(amino_acids)), dtype='float32')
        labels = np.zeros((len(pairs), max_seq_len, len(sst3_mapping)), dtype='float32')
        for row, (seq, sst) in enumerate(pairs):
            for pos, (aa, ss) in enumerate(zip(seq, sst)):
                col = aa_index.get(aa)
                if col is not None:  # unknown residues stay all-zero
                    sequences[row, pos, col] = 1
                labels[row, pos, sst3_mapping[ss]] = 1
        return sequences, labels

    train_sequences, train_labels_categorical = encode(train_pairs)
    test_sequences, test_labels_categorical = encode(test_pairs)

    return train_sequences, train_labels_categorical, test_sequences, test_labels_categorical

In [None]:
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dropout, TimeDistributed, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

def preprocess_data_for_nn(train_data, test_data):
    """Build padded per-sequence tensors for the sequence-labelling networks.

    Each sequence is one-hot encoded over the 20 standard amino acids (any
    other character yields an all-zero row) and each label string is one-hot
    encoded over H/E/C. Both splits are post-padded with zeros to the longest
    sequence found in either split, so padded positions have all-zero feature
    and label vectors.

    The arrays are filled directly with NumPy rather than by one-hot encoding
    a ``-1`` padding sentinel and zeroing it afterwards, and sequence/label
    length mismatches are rejected instead of being silently padded or
    truncated out of alignment.

    NOTE(review): this function is also defined in an earlier cell; this
    later definition is the one in effect when the cells run in order.

    Args:
        train_data: Mapping/table whose 'seq' entry iterates over amino-acid
            strings and whose 'sst3' entry iterates over label strings.
        test_data: Same layout as ``train_data``.

    Returns:
        Tuple ``(train_sequences, train_labels, test_sequences, test_labels)``
        of float32 arrays. Sequences have shape (n, max_len, 20) and labels
        have shape (n, max_len, 3).

    Raises:
        ValueError: If a sequence and its label string differ in length.
        KeyError: If a label string contains a character other than H, E or C.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_index = {aa: col for col, aa in enumerate(amino_acids)}
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}

    def paired_rows(data, split_name):
        """Return [(seq, sst), ...] after checking that lengths agree."""
        pairs = list(zip(data['seq'], data['sst3']))
        for row, (seq, sst) in enumerate(pairs):
            if len(seq) != len(sst):
                raise ValueError(
                    f"{split_name} row {row}: sequence length {len(seq)} does not "
                    f"match sst3 length {len(sst)}"
                )
        return pairs

    train_pairs = paired_rows(train_data, 'train')
    test_pairs = paired_rows(test_data, 'test')

    # Both splits share one padded length so a single model fits either.
    max_seq_len = max((len(seq) for seq, _ in train_pairs + test_pairs), default=0)

    def encode(pairs):
        """Fill zero-initialised tensors; untouched cells are the padding."""
        sequences = np.zeros((len(pairs), max_seq_len, len(amino_acids)), dtype='float32')
        labels = np.zeros((len(pairs), max_seq_len, len(sst3_mapping)), dtype='float32')
        for row, (seq, sst) in enumerate(pairs):
            for pos, (aa, ss) in enumerate(zip(seq, sst)):
                col = aa_index.get(aa)
                if col is not None:  # unknown residues stay all-zero
                    sequences[row, pos, col] = 1
                labels[row, pos, sst3_mapping[ss]] = 1
        return sequences, labels

    train_sequences, train_labels_categorical = encode(train_pairs)
    test_sequences, test_labels_categorical = encode(test_pairs)

    return train_sequences, train_labels_categorical, test_sequences, test_labels_categorical

def build_cnn_model(hp):
    """Assemble and compile the per-residue CNN for one tuning trial.

    The input shape is read from the module-level ``X_train`` array, so that
    global must be set (``tune_cnn`` does this) before the tuner calls this
    function.

    Args:
        hp: Hyperparameter container supplied by the tuner; used to choose the
            filter count, kernel size, dropout rate, dense width and learning
            rate.

    Returns:
        A compiled ``Sequential`` model that emits a 3-way softmax per position.
    """
    input_shape = (X_train.shape[1], X_train.shape[2])

    # Sample hyperparameters up front, in the order the layers consume them.
    n_filters = hp.Choice('filters', [32, 64, 128])
    kernel_size = hp.Choice('kernel_size', [3, 5])
    dropout_rate = hp.Choice('dropout_rate', [0.2, 0.3, 0.5])
    dense_units = hp.Choice('dense_units', [64, 128])
    learning_rate = hp.Choice('learning_rate', [1e-3, 1e-4])

    layer_stack = (
        Input(shape=input_shape),
        # 'same' padding keeps one output step per input residue.
        Conv1D(filters=n_filters, kernel_size=kernel_size, activation='relu', padding='same'),
        Dropout(dropout_rate),
        TimeDistributed(Dense(dense_units, activation='relu')),
        TimeDistributed(Dense(3, activation='softmax')),
    )

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

def tune_cnn(train_file_path, test_file_path, report_file_path, predictions_file_path):
    """Tune the CNN with Hyperband and report its test performance.

    Loads the train/test CSVs, converts them with ``preprocess_data_for_nn``,
    searches hyperparameters with ``kt.Hyperband`` (10% of the training data
    held out for validation), then evaluates the best model on the test split.

    Padded positions are excluded from the report and from the saved
    predictions. Their label vectors are all zero, so ``argmax`` would
    otherwise turn every padded position into a true 'H' and distort the
    per-class scores.

    Args:
        train_file_path: CSV path of the training split.
        test_file_path: CSV path of the test split.
        report_file_path: Text file that receives the best hyperparameters and
            the classification report.
        predictions_file_path: CSV file that receives the predicted class of
            every real (non-padded) test residue in a 'Predictions' column.

    Returns:
        The best model found by the tuner.
    """
    # Step 1: Load data
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Step 2: Preprocess data for the neural network.
    # build_cnn_model reads X_train from module scope, so the arrays are
    # published as globals rather than kept local.
    global X_train, y_train, X_test, y_test
    X_train, y_train, X_test, y_test = preprocess_data_for_nn(train_data, test_data)

    # Step 3: Set up the tuner
    tuner = kt.Hyperband(build_cnn_model,
                         objective='val_accuracy',
                         max_epochs=10,
                         directory='cnn_tuning',
                         project_name='cnn_hyperparameter_tuning')

    # Step 4: Tuning search
    # NOTE(review): the 'accuracy' metric used as the tuning objective appears
    # to be computed over all positions, padding included — confirm whether
    # masking or sample weights are wanted during the search as well.
    tuner.search(X_train, y_train, validation_split=0.1, epochs=10,
                 callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

    # Step 5: Evaluate the best model
    best_model = tuner.get_best_models(num_models=1)[0]
    y_pred = np.argmax(best_model.predict(X_test), axis=-1)
    y_true = np.argmax(y_test, axis=-1)

    # Keep only real residues: a padded position has an all-zero label vector.
    # Boolean indexing also flattens the arrays in row-major order.
    real_positions = y_test.any(axis=-1)
    y_pred_flat = y_pred[real_positions]
    y_true_flat = y_true[real_positions]

    # Step 6: Save report and predictions
    with open(report_file_path, 'w') as f:
        f.write("Best Hyperparameters:\n")
        f.write(str(tuner.get_best_hyperparameters()[0].values))
        f.write("\nClassification Report:\n")
        # Explicit labels keep target_names aligned even if a class is absent.
        f.write(classification_report(y_true_flat, y_pred_flat,
                                      labels=[0, 1, 2], target_names=['H', 'E', 'C']))

    pd.DataFrame(y_pred_flat, columns=['Predictions']).to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

    return best_model


In [None]:
# CNN: tune on the part-2 train/test CSVs and keep the best model.
best_cnn = tune_cnn(
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    report_file_path='CNN_tuning_report.txt',
    predictions_file_path='CNN_pred_tuning.csv',
)

# Hybrid CNN/RNN

In [None]:
from tensorflow.keras.layers import LSTM, TimeDistributed

def tune_hybrid_cnn_rnn(train_file_path, test_file_path, report_file_path, predictions_file_path):
    """Tune the hybrid Conv1D + LSTM model with Hyperband and report on the test split.

    Loads the train/test CSVs, converts them with ``preprocess_data_for_nn``,
    searches hyperparameters with ``kt.Hyperband`` (10% of the training data
    held out for validation), then evaluates the best model on the test split.

    Padded positions are excluded from the report and from the saved
    predictions. Their label vectors are all zero, so ``argmax`` would
    otherwise turn every padded position into a true 'H' and distort the
    per-class scores.

    Args:
        train_file_path: CSV path of the training split.
        test_file_path: CSV path of the test split.
        report_file_path: Text file that receives the best hyperparameters and
            the classification report.
        predictions_file_path: CSV file that receives the predicted class of
            every real (non-padded) test residue in a 'Predictions' column.

    Returns:
        The best model found by the tuner.
    """
    # Step 1: Load data
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Step 2: Preprocess data for the neural network.
    # The arrays are stored as module globals, as the original cell did;
    # build_hybrid_model below reads X_train through that global binding.
    global X_train, y_train, X_test, y_test
    X_train, y_train, X_test, y_test = preprocess_data_for_nn(train_data, test_data)

    # Step 3: Define the hybrid model
    def build_hybrid_model(hp):
        """Build and compile one Conv1D -> LSTM -> per-step Dense candidate."""
        model = Sequential()
        model.add(Input(shape=(X_train.shape[1], X_train.shape[2])))
        model.add(Conv1D(filters=hp.Choice('filters', [32, 64, 128]),
                         kernel_size=hp.Choice('kernel_size', [3, 5]),
                         activation='relu',
                         padding='same'))  # keeps one output step per input residue
        model.add(Dropout(hp.Choice('dropout_rate', [0.2, 0.3, 0.5])))
        model.add(LSTM(units=hp.Choice('lstm_units', [32, 64, 128]),
                       return_sequences=True))  # emit an output at every step
        # Same hyperparameter name as the first Dropout, so both layers
        # presumably share one tuned rate — confirm this is intended.
        model.add(Dropout(hp.Choice('dropout_rate', [0.2, 0.3, 0.5])))
        model.add(TimeDistributed(Dense(hp.Choice('dense_units', [64, 128]), activation='relu')))
        model.add(TimeDistributed(Dense(3, activation='softmax')))
        model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 1e-4])),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    # Step 4: Set up the tuner
    tuner = kt.Hyperband(build_hybrid_model,
                         objective='val_accuracy',
                         max_epochs=10,
                         directory='hybrid_tuning',
                         project_name='hybrid_hyperparameter_tuning')

    # Step 5: Tuning search
    # NOTE(review): the 'accuracy' metric used as the tuning objective appears
    # to be computed over all positions, padding included — confirm whether
    # masking or sample weights are wanted during the search as well.
    tuner.search(X_train, y_train, validation_split=0.1, epochs=10,
                 callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

    # Step 6: Evaluate the best model
    best_model = tuner.get_best_models(num_models=1)[0]
    y_pred = np.argmax(best_model.predict(X_test), axis=-1)
    y_true = np.argmax(y_test, axis=-1)

    # Keep only real residues: a padded position has an all-zero label vector.
    # Boolean indexing also flattens the arrays in row-major order.
    real_positions = y_test.any(axis=-1)
    y_pred_flat = y_pred[real_positions]
    y_true_flat = y_true[real_positions]

    # Step 7: Save report and predictions
    with open(report_file_path, 'w') as f:
        f.write("Best Hyperparameters:\n")
        f.write(str(tuner.get_best_hyperparameters()[0].values))
        f.write("\nClassification Report:\n")
        # Explicit labels keep target_names aligned even if a class is absent.
        f.write(classification_report(y_true_flat, y_pred_flat,
                                      labels=[0, 1, 2], target_names=['H', 'E', 'C']))

    pd.DataFrame(y_pred_flat, columns=['Predictions']).to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

    return best_model


In [None]:
# Hybrid CNN-RNN: tune on the part-2 train/test CSVs and keep the best model.
best_hybrid = tune_hybrid_cnn_rnn(
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    report_file_path='Hybrid_tuning_report.txt',
    predictions_file_path='Hybrid_pred_tuning.csv',
)