In [1]:
from prody import parsePDB, buildDistMatrix
import glob
import pandas as pd
import re

In [2]:
models = glob.glob('models/*.pdb')
models

['models/LMQCWQLLA_66.pdb',
 'models/LVLFDLDED_45718.pdb',
 'models/VLISPVSIL_45835.pdb',
 'models/KLFSYGTLI_5.pdb',
 'models/GLNLEGSGV_45778.pdb',
 'models/DKIDQIIHD_45749.pdb',
 'models/TPQFLLQLN_45658.pdb',
 'models/QQGIVRQRV_45809.pdb',
 'models/GLLIVKTVL_45750.pdb',
 'models/GLKISLCGI_24.pdb',
 'models/MLIGIEILN_45901.pdb',
 'models/VLFRLENHA_12.pdb',
 'models/LTPKAQREI_1.pdb',
 'models/KFPYEGGKV_45658.pdb',
 'models/SLSNLDFRL_1414,8.pdb',
 'models/AQRWANQIR_114.pdb',
 'models/YLEPGPVTA_240.pdb',
 'models/QLREAATEA_1.pdb',
 'models/WMDMWESPM_6.pdb',
 'models/GLHCYEQLV_198.pdb',
 'models/GVSTDIPSA_45839.pdb',
 'models/TVLDHILQK_45901.pdb',
 'models/SLSSQMTST_1.pdb',
 'models/TFALKKLII_45839.pdb',
 'models/AENTNTSKS_45809.pdb',
 'models/YITDYSNDI_12.pdb',
 'models/SQYDPKELL_12.pdb',
 'models/HLTRVGPYL_174.pdb',
 'models/QLIIQAFEA_45749.pdb',
 'models/SVMLIGIEI_45748.pdb',
 'models/YLAEGHACL_804.pdb',
 'models/RTSFFLWVI_45809.pdb',
 'models/QSLGAEIAV_54.pdb',
 'models/FIVEHINAM_42.pd

In [3]:
df_pepdist = pd.DataFrame()

for model in models:

    matches = re.search('models/(\D+)_(.+).pdb',model)
    if matches:
        pep_seq = matches.groups()[0]
        # Extract numeric part of half_life and convert to float
        half_life = float(re.sub(r'[^\d.]', '', matches.groups()[1]))
    else:
        continue

    structure = parsePDB(model)
    peptide   = structure.select('chain A and backbone' )
    hla       = structure.select('chain B and backbone')

    peptide_coords = peptide.getCoords()
    hla_coords     = hla.getCoords()

    dist_matrix = buildDistMatrix(peptide_coords, hla_coords)

    dist_array = dist_matrix.flatten()

    # Concatenate pep_seq, half_life and dist_array to a single row
    row = pd.Series([pep_seq, half_life] + dist_array.tolist())
    df_pepdist = pd.concat([df_pepdist, row.to_frame().T], ignore_index=True)


@> 1571 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 1571 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1561 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1569 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1554 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1572 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1571 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1571 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1562 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1557 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1565 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1573 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1569 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1568 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1570 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1576 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1562 atoms and 1 coordinate set(s) we

In [4]:
df_pepdist.rename({0: 'peptide_seq', 1: 'half_life'}, axis=1, inplace=True)

In [5]:
# Total number of distances per row
num_dists = len(dist_array)

# Generate column names dynamically
dist_columns = [f'dist_{i+1}' for i in range(num_dists)]

# Create full column list
columns = ['peptide_seq', 'half_life'] + dist_columns

# Apply column names before saving
df_pepdist.columns = columns


In [6]:
df_pepdist.to_csv('df_pepdist_2025_04_24.csv', index=False)

In [7]:
df_pepdist = pd.read_csv('df_pepdist_2025_04_24.csv')

In [8]:
dist_min = dist_array.min()
dist_max = dist_array.max()
dist_avg = dist_array.mean()
row = pd.Series([pep_seq, half_life, dist_min, dist_max, dist_avg])


In [9]:
df = pd.read_csv("df_pepdist_2025_04_24.csv")
df.head()           # First 5 rows
df.sample(5)        # Random 5 rows
df.describe()       # Statistics


Unnamed: 0,half_life,dist_1,dist_2,dist_3,dist_4,dist_5,dist_6,dist_7,dist_8,dist_9,...,dist_26199,dist_26200,dist_26201,dist_26202,dist_26203,dist_26204,dist_26205,dist_26206,dist_26207,dist_26208
count,459.0,459.0,459.0,459.0,459.0,459.0,459.0,459.0,459.0,459.0,...,459.0,459.0,459.0,459.0,459.0,459.0,459.0,459.0,459.0,459.0
mean,24806.734641,20.117925,21.489726,21.881823,21.243883,23.026989,23.620303,24.219363,24.549384,24.477132,...,40.13711,39.233742,41.297992,41.670428,41.903333,42.305772,41.670583,41.974698,43.474557,43.959634
std,22606.687952,0.111698,0.1066,0.117302,0.141322,0.129627,0.179842,0.139275,0.15423,0.144415,...,0.364492,0.43277,0.296422,0.303534,0.353992,0.369008,0.377758,0.461254,0.40984,0.377295
min,0.4,19.839033,21.243765,21.504365,20.812181,22.536616,22.991434,23.735603,24.0963,24.047427,...,39.130793,38.305943,40.411397,41.000687,41.186274,41.507801,41.121433,41.425205,42.861095,43.163498
25%,180.0,20.041221,21.421894,21.805762,21.152268,22.937362,23.495286,24.129399,24.456007,24.386493,...,39.937399,39.024347,41.123116,41.496351,41.73974,42.139157,41.506247,41.808109,43.310716,43.795466
50%,45689.0,20.110592,21.47984,21.893502,21.255439,23.044255,23.621921,24.22202,24.556532,24.464241,...,40.121185,39.209464,41.288204,41.637222,41.867796,42.275766,41.644401,41.923626,43.429633,43.917259
75%,45809.0,20.184074,21.533471,21.957563,21.346523,23.118331,23.777185,24.307662,24.635274,24.5526,...,40.307404,39.373404,41.450315,41.823004,42.021484,42.43622,41.766824,42.042974,43.549847,44.04334
max,47348.0,20.673696,22.104466,22.446717,21.710644,23.517124,24.194957,24.995402,25.735893,25.054701,...,43.681107,43.981512,44.303825,45.36256,46.657673,47.146843,46.745992,48.156102,49.050246,48.910564


In [10]:
df['dist_min'] = df.filter(like='dist_').min(axis=1)
df['dist_max'] = df.filter(like='dist_').max(axis=1)
df['dist_mean'] = df.filter(like='dist_').mean(axis=1)
df['dist_std'] = df.filter(like='dist_').std(axis=1)

In [1]:
import pandas as pd
import numpy as np
import torch 
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load and prepare the data
df = pd.read_csv('df_pepdist_2025_04_24.csv')

# 2. Categorize half-life values 
def categorize_half_life(half_life):
    if half_life < 100:
        return 0  # Unstable
    elif half_life < 1000:
        return 1  # Moderately stable
    else:
        return 2  # Stable

df['stability_class'] = df['half_life'].apply(categorize_half_life)



In [2]:
# 3. Separate features and target
# Get peptide sequences
peptide_sequences = df['peptide_seq'].values

# Get distance measurements
distance_columns = [col for col in df.columns if col.startswith('dist_')]
distance_features = df[distance_columns].values

# Get target variable
y = df['stability_class'].values

In [3]:
# 4. Encode peptide sequences
def encode_peptide(peptide_seq):
    # One-hot encoding for amino acids
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    encoding = np.zeros((len(peptide_seq), len(amino_acids)))
    
    for i, aa in enumerate(peptide_seq):
        if aa in amino_acids:
            encoding[i, amino_acids.index(aa)] = 1
    
    return encoding.flatten()  # Returns a 1D array

# Apply encoding to all peptides
peptide_features = np.array([encode_peptide(seq) for seq in peptide_sequences])

In [4]:
# 5. Normalize distance features
scaler = StandardScaler()
normalized_distances = scaler.fit_transform(distance_features)

# 6. Create PyTorch Dataset
class PMHCDataset(Dataset):
    def __init__(self, peptide_features, distance_features, labels):
        self.peptide_features = torch.FloatTensor(peptide_features)
        self.distance_features = torch.FloatTensor(distance_features)
        self.labels = torch.LongTensor(labels)
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, idx):
        return {
            'peptide': self.peptide_features[idx], 
            'distances': self.distance_features[idx],
            'label': self.labels[idx]
        }
        

In [5]:
# 7. Split data and create dataloaders
X_peptide_train, X_peptide_test, X_dist_train, X_dist_test, y_train, y_test = train_test_split(
    peptide_features, normalized_distances, y, test_size=0.2, stratify=y, random_state=42)

# Create datasets
train_dataset = PMHCDataset(X_peptide_train, X_dist_train, y_train)
test_dataset = PMHCDataset(X_peptide_test, X_dist_test, y_test)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [6]:
# 8. Define your PyTorch model
import torch.nn as nn
import torch.optim as optim

class PMHCStabilityModel(nn.Module):
    def __init__(self, peptide_dim, distance_dim, hidden_dim=64, num_classes=3):
        super(PMHCStabilityModel, self).__init__()
        
        # Peptide processing branch
        self.peptide_network = nn.Sequential(
            nn.Linear(peptide_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        # Distance processing branch
        self.distance_network = nn.Sequential(
            nn.Linear(distance_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Combined processing
        self.combined_network = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )
    def forward(self, peptide, distances):
        peptide_features = self.peptide_network(peptide)
        distance_features = self.distance_network(distances)
        
        combined = torch.cat((peptide_features, distance_features), dim=1)
        output = self.combined_network(combined)
        
        return output

In [7]:
# 9. Initialize model
model = PMHCStabilityModel(
    peptide_dim=peptide_features.shape[1],  # Size of one-hot encoded peptide
    distance_dim=normalized_distances.shape[1]  # Number of distance features
)

In [8]:
# 10. Define training function
def train_model(model, train_loader, test_loader, epochs=100):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)
    
    train_losses = []
    test_accuracies = []
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(batch['peptide'], batch['distances'])
            loss = criterion(outputs, batch['label'])
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)
        scheduler.step(epoch_loss)
        
        # Evaluation phase
        if epoch % 5 == 0 or epoch == epochs-1:
            model.eval()
            correct = 0
            total = 0
            with torch.no_grad():
                for batch in test_loader:
                    outputs = model(batch['peptide'], batch['distances'])
                    _, predicted = torch.max(outputs, 1)
                    total += batch['label'].size(0)
                    correct += (predicted == batch['label']).sum().item()
            
            accuracy = 100 * correct / total
            test_accuracies.append(accuracy)
            
            print(f'Epoch {epoch+1}/{epochs}: '
                  f'Loss: {epoch_loss:.4f}, '
                  f'Test Accuracy: {accuracy:.2f}%')
    
    return model, train_losses, test_accuracies


In [9]:
# 11. Train the model
trained_model, losses, accuracies = train_model(model, train_loader, test_loader, epochs=100)

# 12. Final evaluation
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            outputs = model(batch['peptide'], batch['distances'])
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch['label'].cpu().numpy())
    
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))
    return all_preds, all_labels

# Evaluate the trained model
predictions, true_labels = evaluate_model(trained_model, test_loader)

Epoch 1/100: Loss: 1.9664, Test Accuracy: 64.13%
Epoch 6/100: Loss: 0.9061, Test Accuracy: 59.78%
Epoch 11/100: Loss: 0.7589, Test Accuracy: 67.39%
Epoch 16/100: Loss: 0.6377, Test Accuracy: 63.04%
Epoch 21/100: Loss: 0.6342, Test Accuracy: 65.22%
Epoch 26/100: Loss: 0.4989, Test Accuracy: 69.57%
Epoch 31/100: Loss: 0.4389, Test Accuracy: 60.87%
Epoch 36/100: Loss: 0.3691, Test Accuracy: 65.22%
Epoch 41/100: Loss: 0.3799, Test Accuracy: 65.22%
Epoch 46/100: Loss: 0.3467, Test Accuracy: 64.13%
Epoch 51/100: Loss: 0.2009, Test Accuracy: 60.87%
Epoch 56/100: Loss: 0.2167, Test Accuracy: 63.04%
Epoch 61/100: Loss: 0.2920, Test Accuracy: 61.96%
Epoch 66/100: Loss: 0.1953, Test Accuracy: 68.48%
Epoch 71/100: Loss: 0.1468, Test Accuracy: 66.30%
Epoch 76/100: Loss: 0.0903, Test Accuracy: 65.22%
Epoch 81/100: Loss: 0.0856, Test Accuracy: 65.22%
Epoch 86/100: Loss: 0.0971, Test Accuracy: 65.22%
Epoch 91/100: Loss: 0.0513, Test Accuracy: 67.39%
Epoch 96/100: Loss: 0.0371, Test Accuracy: 68.48%
Ep

In [4]:
# Save a lighter CSV with the first 50 columns for testing Rainbow CSV
import pandas as pd  # Ensure pandas is imported
df = pd.read_csv('df_pepdist_2025_04_24.csv')  # Load the CSV into a DataFrame
reduced_df = df.iloc[:, :50]  # Select the first 50 columns
# Save the file to the current working directory
reduced_df.to_csv("df_pepdist_small_preview.csv", index=False)
