In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [4]:
def categorize_half_life(half_life, unstable_below=100, highly_stable_from=1000):
    """Map a half-life value to a coarse stability label.

    Parameters
    ----------
    half_life : number or None
        The half-life measurement to classify. It is compared directly against
        the two thresholds, so it must be in the same units as them.
    unstable_below : number, default 100
        Values strictly below this threshold are labelled "Unstable".
    highly_stable_from : number, default 1000
        Values at or above this threshold are labelled "Highly stable".

    Returns
    -------
    str or None
        "Unstable", "Moderately Stable" or "Highly stable". ``None`` is
        returned when ``half_life`` is missing (``None`` or NaN) so that the
        caller can see, and drop, rows without a usable measurement.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing measurement would fall through to the last branch and be
    # labelled "Highly stable".
    if pd.isna(half_life):
        return None
    if half_life < unstable_below:
        return "Unstable"
    if half_life < highly_stable_from:
        return "Moderately Stable"
    return "Highly stable"

# Read the peptide table from CSV and, in the same step, attach a
# 'stability_class' column obtained by running categorize_half_life
# over every value of the 'half_life' column.
df = (
    pd.read_csv('df_pepdist_2025_04_24.csv')
    .assign(stability_class=lambda table: table['half_life'].map(categorize_half_life))
)


In [6]:
# Composition features: one count per one-letter amino-acid code.
# The order of this list fixes the order of the feature columns.
amino_acids = [*'ACDEFGHIKLMNPQRSTVWY']

def peptide_features(seq):
    """Count how often each code in ``amino_acids`` occurs in ``seq``.

    Returns a list of integers, one per entry of ``amino_acids`` and in the
    same order. Counting is done with ``seq.count``, so it is case-sensitive,
    and characters that are not in ``amino_acids`` do not contribute.
    """
    return list(map(seq.count, amino_acids))

# Keep only rows that can be used for training: a sequence to count residues in,
# a measured half-life, and a stability label. Rows with a missing half-life are
# excluded explicitly because a label derived from a missing value cannot be trusted.
model_rows = df.dropna(subset=['peptide_seq', 'half_life', 'stability_class'])
n_dropped = len(df) - len(model_rows)
if n_dropped:
    print(f"Excluded {n_dropped} row(s) with a missing sequence, half-life or label")

# One row of amino-acid counts per peptide; columns follow the order of `amino_acids`
peptide_feature_matrix = np.array([peptide_features(seq) for seq in model_rows['peptide_seq']])
# Create a DataFrame for the features, indexed like the rows they were computed from
features_df = pd.DataFrame(peptide_feature_matrix, columns=amino_acids, index=model_rows.index)
# Add the stability class positionally: row i of the matrix was built from row i of
# model_rows, so the labels must not be re-matched by index label.
features_df['stability_class'] = model_rows['stability_class'].to_numpy()
# Split the data into features and target
X = features_df.drop('stability_class', axis=1)
y = features_df['stability_class']
# Split the data into training and testing sets (80% / 20%, fixed seed for repeatability)
# NOTE(review): the split is not stratified; consider stratify=y if the class
# proportions of the test set should mirror the full dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the model
rf.fit(X_train, y_train)
# Make predictions
y_pred = rf.predict(X_test)
# Print the classification report
print(classification_report(y_test, y_pred))
# Print the confusion matrix: rows are true classes, columns are predicted classes.
# Passing labels=rf.classes_ pins both axes to the classes the model was trained on,
# so the layout does not depend on which classes happen to appear in the test split.
print(confusion_matrix(y_test, y_pred, labels=rf.classes_))
# Save the model
joblib.dump(rf, 'rf_peptide_stability_model.pkl')

                   precision    recall  f1-score   support

    Highly stable       0.52      0.84      0.64        49
Moderately Stable       0.10      0.06      0.07        18
         Unstable       0.33      0.04      0.07        25

         accuracy                           0.47        92
        macro avg       0.32      0.31      0.26        92
     weighted avg       0.39      0.47      0.37        92

[[41  6  2]
 [17  1  0]
 [21  3  1]]


['rf_peptide_stability_model.pkl']