In [8]:
# 1) importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from Bio.SeqUtils.ProtParam import ProteinAnalysis


In [9]:
# 2) Defining the stability-class cut-offs and loading the dataset
def categorize_half_life(half_life, unstable_below=12, highly_stable_from=2500):
    """Map a half-life value to one of three stability class labels.

    Parameters
    ----------
    half_life : float
        Half-life value, expressed in the same unit as the two cut-offs.
    unstable_below : float, default 12
        Values strictly below this cut-off are labelled "Unstable".
    highly_stable_from : float, default 2500
        Values at or above this cut-off are labelled "Highly stable".
        Expected to be greater than or equal to ``unstable_below``.

    Returns
    -------
    str
        "Unstable", "Moderately Stable" or "Highly stable".

    Notes
    -----
    Every comparison with NaN is False, so a missing half-life falls through
    to the last branch and is labelled "Highly stable". Drop or impute missing
    values before calling this function.
    """
    if half_life < unstable_below:
        return "Unstable"
    if half_life < highly_stable_from:
        return "Moderately Stable"
    return "Highly stable"


# NOTE(review): absolute, machine-specific path. Point this at a
# project-relative data directory before sharing the notebook.
DATA_PATH = '/Users/ozgetimur/Desktop/halfpepstab/pepdist_final.csv'

df = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if the columns this notebook reads
# ('half_life' here, 'peptide_seq' when building features) are absent.
missing_columns = {'half_life', 'peptide_seq'} - set(df.columns)
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}")

# A missing half-life fails both threshold comparisons in
# categorize_half_life and would silently be labelled "Highly stable";
# a missing sequence cannot be turned into features. Drop such rows and
# reset the index so `df` keeps a contiguous 0..n-1 index.
df = df.dropna(subset=['half_life', 'peptide_seq']).reset_index(drop=True)

# Label every peptide with its stability class.
df['stability_class'] = df['half_life'].apply(categorize_half_life)


In [13]:

from scipy.stats import skew
import numpy as np
import pandas as pd

#3) Calculating the average molecular weight
def calculate_molecular_weight(seq):
    """Return the mean per-residue molecular weight of a peptide sequence.

    Every one-letter code in ``seq`` is looked up in a fixed table covering
    20 amino acids and the looked-up values are averaged with ``np.mean``.
    A code that is not in the table raises ``KeyError``; an empty sequence
    yields NaN (NumPy emits a RuntimeWarning).
    """
    weight_of = {
        'A': 89.09, 'C': 121.15, 'D': 133.10, 'E': 147.13,
        'F': 165.19, 'G': 75.07, 'H': 155.16, 'I': 131.17,
        'K': 146.19, 'L': 131.17, 'M': 149.21, 'N': 132.12,
        'P': 115.13, 'Q': 146.15, 'R': 174.20, 'S': 105.09,
        'T': 119.12, 'V': 117.15, 'W': 204.23, 'Y': 181.19,
    }
    per_residue = list(map(weight_of.__getitem__, seq))
    return np.mean(per_residue)


#4) Calculating the average hydrophobicity
def calculate_hydrophobicity(seq):
    """Return the mean per-residue hydrophobicity of a peptide sequence.

    Every one-letter code in ``seq`` is looked up in a fixed table covering
    20 amino acids and the looked-up values are averaged with ``np.mean``.
    A code that is not in the table raises ``KeyError``; an empty sequence
    yields NaN (NumPy emits a RuntimeWarning).
    """
    # NOTE(review): values look like the Kyte-Doolittle hydropathy scale;
    # confirm the intended source.
    hydropathy_of = {
        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5,
        'F': 2.8, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'K': -3.9, 'L': 3.8, 'M': 1.9, 'N': -3.5,
        'P': -1.6, 'Q': -3.5, 'R': -4.5, 'S': -0.8,
        'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
    }
    per_residue = list(map(hydropathy_of.__getitem__, seq))
    return np.mean(per_residue)

#5) Calculating the average charge
def calculate_charge(seq):
    """Return the mean per-residue charge of a peptide sequence.

    R and K count as +1, D and E as -1, and the other 16 tabulated amino
    acids as 0; the per-residue values are averaged with ``np.mean``, so the
    result is a mean charge per residue rather than a net charge.
    A code that is not in the table raises ``KeyError``; an empty sequence
    yields NaN (NumPy emits a RuntimeWarning).
    """
    # Start every tabulated residue at neutral, then override the charged ones.
    charge_of = dict.fromkeys('ACDEFGHIKLMNPQRSTVWY', 0)
    charge_of.update({'R': 1, 'K': 1, 'D': -1, 'E': -1})
    per_residue = list(map(charge_of.__getitem__, seq))
    return np.mean(per_residue)


# One-letter codes of the 20 residues counted as features, in column order.
amino_acids = [*'ACDEFGHIKLMNPQRSTVWY']


def peptide_features(seq):
    """Return the amino-acid composition of ``seq`` as a list of counts.

    Entry ``i`` is the number of occurrences of ``amino_acids[i]`` in ``seq``.
    Characters that are not in ``amino_acids`` do not contribute to any count.
    """
    return list(map(seq.count, amino_acids))

# 6) Build the feature table: one row per peptide, one count column per
#    amino acid in `amino_acids`.
# NOTE(review): calculate_molecular_weight, calculate_hydrophobicity and
# calculate_charge are defined above but are not used as features in this cell.
peptide_feature_matrix = np.array([peptide_features(seq) for seq in df['peptide_seq']])
features_df = pd.DataFrame(peptide_feature_matrix, columns=amino_acids)

# Attach the labels by position. `features_df` has a fresh 0..n-1 index, so
# assigning the Series itself would align on index labels and silently insert
# NaN (or the wrong label) whenever `df` does not carry that same index.
features_df['stability_class'] = df['stability_class'].to_numpy()

# 7) Separate the feature columns from the target.
X = features_df.drop(columns='stability_class')
y = features_df['stability_class']

# 8) Hold out 20% of the peptides for evaluation (fixed seed for repeatability).
# NOTE(review): the split is not stratified although the classes are
# imbalanced; consider `stratify=y` so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9) Initialise the Random Forest classifier.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 10) Fit on the training split and predict the held-out peptides.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are true classes, columns are predicted classes,
# both in sorted label order (the scikit-learn default when `labels` is omitted).
print(confusion_matrix(y_test, y_pred))


                   precision    recall  f1-score   support

    Highly stable       0.45      0.39      0.42        76
Moderately Stable       0.76      0.91      0.83       210
         Unstable       0.50      0.33      0.39        92

         accuracy                           0.67       378
        macro avg       0.57      0.55      0.55       378
     weighted avg       0.64      0.67      0.64       378

[[ 30  28  18]
 [  6 192  12]
 [ 31  31  30]]
