In [1]:
import os
import torch
from tqdm import tqdm
# Set device to GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [2]:
# Path to the folder containing embeddings
EMBEDDING_FOLDER = 'protein_embeddings'

# Dictionary to store embeddings per PDB ID
embedding_dict_pdb_only = {}

# List all embedding files
embedding_files = [f for f in os.listdir(EMBEDDING_FOLDER) if f.endswith('.pt')]

# Build mapping from PDB ID to embedding (selecting smallest embedding)
for filename in embedding_files:
    # Extract PDB ID and chain ID from filename
    basename = os.path.splitext(filename)[0]
    if '_' in basename:
        pdb_id, chain_id = basename.split('_')
    else:
        pdb_id = basename
        chain_id = ''
    
    # Load the embedding onto CPU
    embedding_path = os.path.join(EMBEDDING_FOLDER, filename)
    embedding = torch.load(embedding_path, map_location='cpu')  # Load onto CPU

    # Ensure embedding is a tensor
    if not isinstance(embedding, torch.Tensor):
        print(f"Invalid embedding for {pdb_id}_{chain_id}")
        continue
    
    # Check if the PDB ID is already in the dictionary
    if pdb_id not in embedding_dict_pdb_only:
        # Add the embedding to the dictionary
        embedding_dict_pdb_only[pdb_id] = embedding
    else:
        # If multiple chains, select the smallest embedding (shortest sequence)
        current_embedding = embedding_dict_pdb_only[pdb_id]
        if embedding.numel() < current_embedding.numel():
            embedding_dict_pdb_only[pdb_id] = embedding


  embedding = torch.load(embedding_path, map_location='cpu')  # Load onto CPU


In [3]:
interaction_pairs = []
with open('interactions_data.txt', 'r') as file:
    for line in file:
        pdb_id1, pdb_id2, interaction = line.strip().split()
        interaction = int(interaction)
        interaction_pairs.append((pdb_id1.upper(), pdb_id2.upper(), interaction))


In [4]:
X = []
y = []

missing_embeddings = set()
total_pairs = len(interaction_pairs)

for pdb_id1, pdb_id2, interaction in tqdm(interaction_pairs, desc="Processing pairs"):
    # Check if embeddings for both PDB IDs are available
    if pdb_id1 in embedding_dict_pdb_only and pdb_id2 in embedding_dict_pdb_only:
        # Retrieve embeddings
        embedding1 = embedding_dict_pdb_only[pdb_id1]
        embedding2 = embedding_dict_pdb_only[pdb_id2]
        
        # Concatenate embeddings
        combined_embedding = torch.cat((embedding1, embedding2), dim=0)
        
        # Append to the list
        X.append(combined_embedding)
        y.append(interaction)
    else:
        # Record missing embeddings
        if pdb_id1 not in embedding_dict_pdb_only:
            missing_embeddings.add(pdb_id1)
        if pdb_id2 not in embedding_dict_pdb_only:
            missing_embeddings.add(pdb_id2)

print(f"Total pairs processed: {len(X)}")
print(f"Total pairs missing embeddings: {total_pairs - len(X)}")
print(f"Number of unique proteins missing embeddings: {len(missing_embeddings)}")


Processing pairs: 100%|██████████| 10004/10004 [00:00<00:00, 108561.04it/s]

Total pairs processed: 9806
Total pairs missing embeddings: 198
Number of unique proteins missing embeddings: 29





In [5]:
from torch.utils.data import Dataset

class ProteinInteractionDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = torch.tensor(y, dtype=torch.float32)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        x = self.X[idx].float()  # Convert to float32
        y = self.y[idx]
        return x, y

        
from sklearn.preprocessing import StandardScaler

# Collect all embeddings into a single tensor
all_embeddings = torch.stack([x.cpu() for x in X])

# Convert to NumPy for normalization
all_embeddings_np = all_embeddings.numpy()

# Fit the scaler on all data (or training data only)
scaler = StandardScaler()
scaler.fit(all_embeddings_np)

# Normalize embeddings and ensure dtype is float32
X_normalized = [
    torch.tensor(scaler.transform(x.cpu().unsqueeze(0)), dtype=torch.float32).squeeze(0).to(device)
    for x in X
]

# Update the dataset with normalized embeddings
dataset = ProteinInteractionDataset(X_normalized, y)



In [6]:
from torch.utils.data import random_split

# Define lengths for train, validation, and test sets
total_size = len(dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")
print(f"Test examples: {len(test_dataset)}")


Training examples: 7844
Validation examples: 980
Test examples: 982


In [7]:
from torch.utils.data import DataLoader

batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [8]:
input_dim = X[0].shape[0]
print(f"Input dimension: {input_dim}")


Input dimension: 3072


In [9]:
import torch.nn as nn

class PPIClassifier(nn.Module):
    def __init__(self, input_dim):
        super(PPIClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()  # For binary classification
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x


In [10]:
model = PPIClassifier(input_dim).to(device)
print(model)


PPIClassifier(
  (fc1): Linear(in_features=3072, out_features=512, bias=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [11]:
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [12]:
num_epochs = 20

for epoch in range(num_epochs):
    # Training Phase
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device).unsqueeze(1)  # Reshape labels to (batch_size, 1)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        # Statistics
        train_loss += loss.item() * inputs.size(0)
        predicted = (outputs >= 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
    # Calculate average loss and accuracy
    train_loss /= len(train_dataset)
    train_acc = correct / total
    
    # Validation Phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device).unsqueeze(1)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item() * inputs.size(0)
            predicted = (outputs >= 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    
    val_loss /= len(val_dataset)
    val_acc = correct / total
    
    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


RuntimeError: mat1 and mat2 must have the same dtype, but got Double and Float

In [None]:
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device).unsqueeze(1)
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        test_loss += loss.item() * inputs.size(0)
        predicted = (outputs >= 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

test_loss /= len(test_dataset)
test_acc = correct / total

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Collect all predictions and labels
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device).unsqueeze(1)
        
        outputs = model(inputs)
        predicted = (outputs >= 0.5).float()
        
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Classification report
print(classification_report(all_labels, all_preds, digits=4))

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds)
print("Confusion Matrix:")
print(cm)
