In [5]:
#Random Forest model as a function with all the parameters as well as input files and output file names

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_and_evaluate_rf(
    train_file_path,
    test_file_path,
    report_file_path,
    predictions_file_path,
    n_estimators,
    max_depth,
    min_samples_split,
    min_samples_leaf
):
    # Load the datasets
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # List of amino acids (for one-hot encoding)
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    # Function to one-hot encode a sequence
    def one_hot_encode(seq, aa_list):
        encoding = np.zeros((len(seq), len(aa_list)), dtype=int)
        for i, aa in enumerate(seq):
            if aa in aa_list:
                encoding[i, aa_list.index(aa)] = 1
        return encoding

    # Encode the sequences for training and testing
    train_encoded = [one_hot_encode(seq, amino_acids) for seq in train_data['seq']]
    test_encoded = [one_hot_encode(seq, amino_acids) for seq in test_data['seq']]

    # Find the maximum sequence length in the training and testing datasets
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    # Pad the sequences to the maximum length
    train_sequences = pad_sequences(train_encoded, maxlen=max_seq_len, padding='post', dtype='float32')
    test_sequences = pad_sequences(test_encoded, maxlen=max_seq_len, padding='post', dtype='float32')

    # Encode the secondary structures as target labels
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}
    train_labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in train_data['sst3']],
                                 maxlen=max_seq_len, padding='post', value=-1)
    test_labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in test_data['sst3']],
                                maxlen=max_seq_len, padding='post', value=-1)

    # Flatten the sequences and labels
    train_sequences_flat = train_sequences.reshape(-1, train_sequences.shape[2])
    test_sequences_flat = test_sequences.reshape(-1, test_sequences.shape[2])
    train_labels_flat = train_labels.flatten()
    test_labels_flat = test_labels.flatten()

    # Create mask to filter out padded positions (-1)
    train_mask = train_labels_flat != -1
    test_mask = test_labels_flat != -1

    # Apply the mask to filter out padding
    train_sequences_flat = train_sequences_flat[train_mask]
    train_labels_flat = train_labels_flat[train_mask]
    test_sequences_flat = test_sequences_flat[test_mask]
    test_labels_flat = test_labels_flat[test_mask]

    # Train the Random Forest model with the original settings
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    rf_model.fit(train_sequences_flat, train_labels_flat)

    # Make predictions on the test set
    test_predictions = rf_model.predict(test_sequences_flat)

    # Generate the classification report
    report = classification_report(test_labels_flat, test_predictions, target_names=['H', 'E', 'C'])

    # Save the classification report to a text file
    with open(report_file_path, 'w') as f:
        f.write(f"Random Forest Test Accuracy: {rf_model.score(test_sequences_flat, test_labels_flat):.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    # Save the predictions along with the true labels to a CSV file
    results_df = pd.DataFrame({
        'True_Label': test_labels_flat,
        'Predicted_Label': test_predictions
    })
    results_df.to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

In [27]:
#Initial values of paramets and first dataset
train_and_evaluate_rf(
    train_file_path='/content/training_data_clean.csv', #training dataset
    test_file_path='/content/test_data_clean.csv',  #Test dataset
    report_file_path='RF_Initial_report.txt', #Report consisting of ex acuuracy and F1 scores
    predictions_file_path='RF_initial_predictions.csv', #The predictions performed by the model
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report saved to RF_Initial_report.txt
Predictions saved to RF_initial_predictions.csv


In [29]:
#Initial values and second dataset
train_and_evaluate_rf(
    train_file_path='/content/training_data__part2_clean.csv',   #training dataset
    test_file_path='/content/test_data_part2_clean.csv',  #Test dataset
    report_file_path='RF_part2a_report.txt',  #Report consisting of ex acuuracy and F1 scores
    predictions_file_path='RF_part2a_predictions.csv',  #The predictions performed by the model
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report saved to RF_part2a_report.txt
Predictions saved to RF_part2a_predictions.csv


In [1]:
#Code needed for the parameter tuning function

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def preprocess_data(train_data, test_data):
    # List of amino acids (for one-hot encoding)
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    # Function to one-hot encode a sequence
    def one_hot_encode(seq, aa_list):
        encoding = np.zeros((len(seq), len(aa_list)), dtype=int)
        for i, aa in enumerate(seq):
            if aa in aa_list:
                encoding[i, aa_list.index(aa)] = 1
        return encoding

    # Encode the sequences for training and testing
    train_encoded = [one_hot_encode(seq, amino_acids) for seq in train_data['seq']]
    test_encoded = [one_hot_encode(seq, amino_acids) for seq in test_data['seq']]

    # Find the maximum sequence length in the training and testing datasets
    max_seq_len = max(max(len(seq) for seq in train_data['seq']),
                      max(len(seq) for seq in test_data['seq']))

    # Pad the sequences to the maximum length
    train_sequences = pad_sequences(train_encoded, maxlen=max_seq_len, padding='post', dtype='float32')
    test_sequences = pad_sequences(test_encoded, maxlen=max_seq_len, padding='post', dtype='float32')

    # Encode the secondary structures as target labels
    # We'll use integer encoding for the secondary structure: H = 0, E = 1, C = 2
    sst3_mapping = {'H': 0, 'E': 1, 'C': 2}
    train_labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in train_data['sst3']],
                                 maxlen=max_seq_len, padding='post', value=-1)
    test_labels = pad_sequences([[sst3_mapping[ss] for ss in sst] for sst in test_data['sst3']],
                                maxlen=max_seq_len, padding='post', value=-1)

    # Flatten the sequences and labels
    train_sequences_flat = train_sequences.reshape(-1, train_sequences.shape[2])  # Shape: (number of sequences * max_seq_len, 20)
    test_sequences_flat = test_sequences.reshape(-1, test_sequences.shape[2])    # Shape: (number of sequences * max_seq_len, 20)
    train_labels_flat = train_labels.flatten()  # Shape: (number of sequences * max_seq_len,)
    test_labels_flat = test_labels.flatten()    # Shape: (number of sequences * max_seq_len,)

    # Create mask to filter out padded positions (-1)
    train_mask = train_labels_flat != -1
    test_mask = test_labels_flat != -1

    # Apply the mask to filter out padding
    X_train = train_sequences_flat[train_mask]
    y_train = train_labels_flat[train_mask]
    X_test = test_sequences_flat[test_mask]
    y_test = test_labels_flat[test_mask]

    return X_train, y_train, X_test, y_test

In [2]:
#Hyper parameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

def tune_random_forest(train_file_path, test_file_path, report_file_path, predictions_file_path, param_grid):
    # Step 1: Load Data
    train_data = pd.read_csv(train_file_path)
    test_data = pd.read_csv(test_file_path)

    # Step 2: Preprocess Data
    X_train, y_train, X_test, y_test = preprocess_data(train_data, test_data)

    # Step 3: Define Hyperparameter Grid

    # Step 4: Initialize and Fit Grid Search
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Step 5: Predict and Evaluate
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Get the best parameters
    best_params = grid_search.best_params_

    # Step 6: Save Report and Predictions
    with open(report_file_path, 'w') as f:
        f.write("Random Forest Best Parameters:\n")
        for param, value in best_params.items():
            f.write(f"{param}: {value}\n")
        f.write(f"\nTest Accuracy: {accuracy:.4f}\n")
        f.write("\nClassification Report:\n")
        f.write(classification_rep)

    pd.DataFrame(y_pred, columns=['Predictions']).to_csv(predictions_file_path, index=False)

    print(f"Classification report saved to {report_file_path}")
    print(f"Predictions saved to {predictions_file_path}")

    return best_model


In [3]:
# Random Forest, broad tuning

param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

best_rf = tune_random_forest('/content/training_data__part2_clean.csv', '/content/test_data_part2_clean.csv', 'RF_tuning_report.txt', 'RF_pred_tuning.csv', param_grid)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report saved to RF_tuning_report.txt
Predictions saved to RF_pred_tuning.csv


In [24]:
#Post tuning testing (evaluating on dataset2)
train_and_evaluate_rf(
    train_file_path='/content/training_data__part2_clean.csv',
    test_file_path='/content/test_data_part2_clean.csv',
    report_file_path='RF_part2b_report.txt',
    predictions_file_path='RF_part2b_predictions.csv',
    n_estimators=50,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report saved to RF_part2b_report.txt
Predictions saved to RF_part2b_predictions.csv


In [25]:
#Post tuning testing (evaluating on dataset 3)
train_and_evaluate_rf(
    train_file_path='/content/training_data__part3_clean.csv',
    test_file_path='/content/test_data_part3_clean.csv',
    report_file_path='RF_part3_report.txt',
    predictions_file_path='RF_part3_predictions.csv',
    n_estimators=50,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report saved to RF_part3_report.txt
Predictions saved to RF_part3_predictions.csv


In [26]:
#Post tuning testing (evaluating on dataset 4)
train_and_evaluate_rf(
    train_file_path='/content/training_data_part4.csv',
    test_file_path='/content/test_data_part4.csv',
    report_file_path='RF_part4_report.txt',
    predictions_file_path='RF_part4_predictions.csv',
    n_estimators=50,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report saved to RF_part4_report.txt
Predictions saved to RF_part4_predictions.csv
