In [2]:
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import os
from PIL import Image
import torchvision.transforms as transforms

**Directories** 

Load the peptide binding-affinity data and keep only the peptides that are 9 residues long.

In [15]:
# Location of the input data, relative to the notebook.
data_dir = '../data/'
peptides_file = os.path.join(data_dir, "binding_affinity.txt")

# Read every whitespace-separated field as a string, one list per row.
raw_rows = np.loadtxt(peptides_file, dtype=str).tolist()

# Keep only rows whose second column is exactly 9 characters long, and drop
# the first column of each kept row (later cells unpack what remains as
# `peptide, score`).
peptides = []
for row in raw_rows:
    if len(row[1]) == 9:
        peptides.append(row[1:])


**One-hot encoding**

In [4]:
# One-letter codes of the 20 amino acids; a letter's position in this string
# is its column index in the one-hot encoding.
aa = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = {c: i for i, c in enumerate(aa)}

def encode_peptide(peptide):
    """One-hot encode a peptide sequence.

    Parameters
    ----------
    peptide : str (or any sequence of one-letter amino-acid codes)
        Residues to encode, in order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(peptide), len(aa_to_int))`` filled with zeros,
        except for a single 1 per row in the column of that residue.

    Raises
    ------
    KeyError
        If a residue is not a key of ``aa_to_int``.
    """
    # Size the encoding from the lookup table, not from the global `aa`:
    # `aa` is a short name that is easily rebound elsewhere in the notebook
    # (e.g. as a loop variable), which would silently change len(aa) and make
    # the column index below go out of bounds.
    encoding = np.zeros((len(peptide), len(aa_to_int)))
    for i, AA in enumerate(peptide):
        encoding[i, aa_to_int[AA]] = 1
    return encoding

**Load AA depictions**

In [16]:
# Cache of preprocessed image tensors keyed by file path, so each image is
# read from disk and transformed at most once per kernel session.
image_cache = {}
# Select the compute device: GPU when CUDA is available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Preprocessing pipeline, built once instead of on every cache miss:
# resize to 224x224, convert to a float tensor, then normalise each RGB
# channel (these mean/std values are the ones commonly used with
# ImageNet-pretrained torchvision models).
_AA_IMAGE_TRANSFORM = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

def load_AA_image(img_path):
    """Load an image, preprocess it and return it as a batched tensor.

    Results are memoised in ``image_cache``; repeated calls with the same
    path return the very same tensor object.

    Parameters
    ----------
    img_path : str
        Path of the image file to load.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(1, 3, 224, 224)`` located on ``device``.
    """
    if img_path in image_cache:
        return image_cache[img_path]

    # Use a context manager so the underlying file handle is released
    # deterministically; convert() returns an independent in-memory copy.
    with Image.open(img_path) as img:
        rgb_image = img.convert('RGB')

    # unsqueeze(0) adds the leading batch dimension.
    processed_image = _AA_IMAGE_TRANSFORM(rgb_image).unsqueeze(0).to(device)

    image_cache[img_path] = processed_image

    return processed_image

In [17]:
# One-letter amino-acid code -> name used in the structure image file name.
amino_acid_full_names = {
    'A': 'alanine',
    'R': 'arginine',
    'N': 'asparagine',
    'D': 'aspartic_acid',
    'C': 'cysteine',
    'E': 'glutamic_acid',
    'Q': 'glutamine',
    'G': 'glycine',
    'H': 'histidine',
    'I': 'isoleucine',
    'L': 'leucine',
    'K': 'lysine',
    'M': 'methionine',
    'F': 'phenylalanine',
    'P': 'proline',
    'S': 'serine',
    'T': 'threonine',
    'W': 'tryptophan',
    'Y': 'tyrosine',
    'V': 'valine'
}

pixel_features = []

# NOTE: the loop variables below deliberately avoid the name `aa`, which is
# the global amino-acid alphabet string defined in the one-hot encoding cell;
# rebinding it here would corrupt that global for every later cell.
for full_name in amino_acid_full_names.values():
    # load and preprocess
    img_path = f'../data/2Dstruc/{full_name}.png'
    image = load_AA_image(img_path)
    # load_AA_image returns a tensor on `device`; NumPy cannot consume CUDA
    # tensors, so copy to host memory before stacking.
    pixel_features.append(image.flatten().cpu().numpy())

# One row of flattened pixel values per amino acid, in dictionary order.
pixel_features = np.vstack(pixel_features)

# PCA (currently disabled; re-enabling it requires importing PCA, which the
# notebook's import cell does not do -- TODO confirm)
#pca = PCA()
#pca_pixel_features = pca.fit_transform(pixel_features)

# Map one-letter code -> raw flattened pixel feature vector (no PCA applied).
aa_features_dict = {
    letter: pixel_features[idx, :]
    for idx, letter in enumerate(amino_acid_full_names)
}

**Data preparation for one-hot encoding**

In [None]:
# Build one-hot model inputs and float targets from the (peptide, score) pairs.
targets = []
encodings = []
for peptide, score in peptides:
    # encode_peptide accepts a whole sequence and returns one row per residue,
    # so there is no need to encode residue-by-residue and squeeze afterwards.
    X = encode_peptide(peptide)
    encodings.append(X)
    targets.append(float(score))

# Shape: (n_peptides, peptide_length, alphabet_size), float32.
tensor_input = torch.tensor(np.stack(encodings), dtype=torch.float32)
# Shape: (n_peptides, 1), float32.
targets = torch.tensor(targets, dtype=torch.float32).unsqueeze(1)

# Split into training and evaluation sets (80/20 split).
# The split is positional (no shuffling): the last 20% of rows, in file order,
# form the evaluation set.
eval_size = int(0.2 * len(encodings))
# Slice with an explicit boundary index: `x[:-eval_size]` / `x[-eval_size:]`
# would swap the two sets when eval_size == 0 (fewer than 5 samples).
split_idx = len(encodings) - eval_size
train_encodings, eval_encodings = tensor_input[:split_idx], tensor_input[split_idx:]
train_targets, eval_targets = targets[:split_idx], targets[split_idx:]

In [None]:
# Data preparation for the image-based features: same pipeline as the one-hot
# cell, but each residue is represented by its flattened structure-image
# pixels from `aa_features_dict`.
# NOTE: this cell rebinds `targets`, `encodings`, `tensor_input` and the
# train/eval variables, replacing the one-hot versions if that cell ran first.
targets = []
encodings = []
for peptide, score in peptides:
    # One feature vector per residue, stacked in sequence order.
    X = np.array([aa_features_dict[amino_acid] for amino_acid in peptide])
    encodings.append(X)
    targets.append(float(score))

# NOTE(review): with raw 1x3x224x224 images every residue carries 150,528
# values, i.e. roughly 5.4 MB of float32 per 9-residue peptide -- check that
# the full dataset fits in memory.
tensor_input = torch.stack([torch.tensor(arr) for arr in encodings])
# squeeze(dim=2) only has an effect if the per-residue feature axis has size 1.
tensor_input = tensor_input.squeeze(dim=2).float()
# Shape: (n_peptides, 1), float32.
targets = torch.tensor(targets, dtype=torch.float32).unsqueeze(1)

# Split into training and evaluation sets (80/20 split).
# The split is positional (no shuffling): the last 20% of rows, in file order,
# form the evaluation set.
eval_size = int(0.2 * len(encodings))
# Slice with an explicit boundary index: `x[:-eval_size]` / `x[-eval_size:]`
# would swap the two sets when eval_size == 0 (fewer than 5 samples).
split_idx = len(encodings) - eval_size
train_encodings, eval_encodings = tensor_input[:split_idx], tensor_input[split_idx:]
train_targets, eval_targets = targets[:split_idx], targets[split_idx:]