# Cell-penetrating peptides (CPP) prediction

This notebook focuses on linear peptides composed entirely of natural amino acids.

In [4]:
import pandas as pd
import numpy as np

# Switch read by the feature-extraction cells: when True they recompute the
# features and save them as .npy files; when False they load the saved arrays.
train = False

# CPP924 dataset

In [5]:
# Read the CPP924 train and test splits into two separate DataFrames.
df_train_cpp924, df_test_cpp924 = (
    pd.read_csv('data/CPP924/train.csv'),
    pd.read_csv('data/CPP924/test.csv'),
)

In [6]:
# Class labels for each split, taken from the `is_cpp` column as NumPy arrays.
y_train, y_test = (split.is_cpp.values for split in (df_train_cpp924, df_test_cpp924))

In [24]:
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef, roc_auc_score

def get_metrics(y_hat, y_test, y_score=None):
    """
        Print accuracy, sensitivity, specificity, MCC and AUROC of binary predictions

        Args:
            y_hat: predicted class labels (0/1).
            y_test: ground-truth class labels (0/1).
            y_score: optional continuous scores for the positive class, e.g.
                ``clf.predict_proba(X)[:, 1]``. When given, AUROC is computed
                from these scores. When omitted, AUROC falls back to the hard
                labels in ``y_hat``, which evaluates a single operating point
                only.
    """
    acc = accuracy_score(y_test, y_hat)
    sn = recall_score(y_test, y_hat)  # sensitivity: recall of the positive class
    sp = recall_score(y_test, y_hat, pos_label=0)  # specificity: recall of class 0
    mcc = matthews_corrcoef(y_test, y_hat)
    # roc_auc_score is meant to rank continuous scores; use them when available.
    auroc = roc_auc_score(y_test, y_hat if y_score is None else y_score)

    print('Acc(%) \t Sn(%) \t Sp(%) \t MCC \t AUROC')
    print(f'{acc*100:.2f}\t{sn*100:.2f}\t{sp*100:.2f}\t{mcc:.3f}\t{auroc:.3f}')


## Feature processing

### Fingerprints

In [17]:
import warnings
from rdkit import Chem, rdBase, DataStructs
from rdkit.Chem import AllChem
from typing import List

# Silence RDKit's error log and ignore the warning categories listed below.
rdBase.DisableLog('rdApp.error')
for _category in (DeprecationWarning, UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)

def fingerprints_from_smiles(smiles: List, size=2048):
    """
        Create ECFP fingerprints of smiles, with validity check

        Args:
            smiles: iterable of SMILES strings.
            size: number of bits per fingerprint.

        Returns:
            fps: array of shape (len(smiles), size); the row of every entry
                that could not be parsed is all zeros.
            valid_mask: list holding 1 for each entry that was parsed into a
                molecule and 0 otherwise.
    """
    fps = []
    valid_mask = []
    for smile in smiles:
        # Non-string entries (None, NaN from a DataFrame column) are flagged as
        # invalid rather than being handed to the SMILES parser.
        mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
        valid_mask.append(int(mol is not None))
        fp = fingerprints_from_mol(mol, size=size) if mol is not None else np.zeros((1, size))
        fps.append(fp)

    if not fps:
        # np.concatenate rejects an empty list; return an empty (0, size) matrix.
        return np.zeros((0, size)), valid_mask

    fps = np.concatenate(fps, axis=0)
    return fps, valid_mask


def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
    """
        Build the Morgan (ECFP) fingerprint of one molecule as a (1, n) array

        `hashed=True` uses the hashed Morgan fingerprint, otherwise the
        bit-vector variant is used; both receive `radius` and `nBits=size`.
    """
    make_fingerprint = (
        AllChem.GetHashedMorganFingerprint
        if hashed
        else AllChem.GetMorganFingerprintAsBitVect
    )
    fingerprint = make_fingerprint(molecule, radius, nBits=size)
    # Buffer that ConvertToNumpyArray fills in place.
    dense = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fingerprint, dense)
    return dense.reshape(1, -1)

In [18]:
# Fingerprint features: load the cached arrays unless `train` requests a rebuild,
# in which case they are recomputed from the SMILES column and saved.
if not train:
    X_train = np.load('data/CPP924/X_train_fps.npy')
    X_test = np.load('data/CPP924/X_test_fps.npy')
else:
    X_train = fingerprints_from_smiles(df_train_cpp924.smi)[0]
    X_test = fingerprints_from_smiles(df_test_cpp924.smi)[0]

    np.save('data/CPP924/X_train_fps.npy', X_train)
    np.save('data/CPP924/X_test_fps.npy', X_test)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((733, 2048), (733,), (183, 2048), (183,))

### ESM-2 features

In [19]:
import torch
import esm

def load_esm2_model():
    """
        Load the pretrained ESM-2 (esm2_t33_650M_UR50D) model in eval mode

        Returns:
            (model, batch_converter, alphabet)
    """
    esm2, vocab = esm.pretrained.esm2_t33_650M_UR50D()
    converter = vocab.get_batch_converter()
    # Eval mode disables dropout so repeated runs give the same embeddings.
    esm2.eval()
    return esm2, converter, vocab

def get_esm_seq_representation(aa_seqs, batch_converter, model, alphabet, batch_size=None):
    """
        Embed amino-acid sequences as mean-pooled layer-33 ESM representations

        Args:
            aa_seqs: iterable of amino-acid sequence strings.
            batch_converter: callable turning ``[(label, seq), ...]`` into
                ``(labels, strs, tokens)``.
            model: model called as ``model(tokens, repr_layers=[33], ...)`` and
                returning a dict with ``["representations"][33]``.
            alphabet: object exposing ``padding_idx``.
            batch_size: optional number of sequences per forward pass. ``None``
                (default) sends all sequences through in a single batch.

        Returns:
            NumPy array with one row per sequence.

        Raises:
            ValueError: if ``aa_seqs`` is empty or ``batch_size`` is below 1.
    """
    data = [(f"seq{idx}", seq) for idx, seq in enumerate(aa_seqs)]
    if not data:
        raise ValueError('aa_seqs must contain at least one sequence')
    if batch_size is not None and batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    step = len(data) if batch_size is None else batch_size

    sequence_representations = []
    for start in range(0, len(data), step):
        _, _, batch_tokens = batch_converter(data[start:start + step])
        batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

        with torch.no_grad():
            # Contact maps are never read here, so do not ask the model for them.
            results = model(batch_tokens, repr_layers=[33], return_contacts=False)
        token_representations = results["representations"][33]

        for i, tokens_len in enumerate(batch_lens):
            # Average over the non-padding positions, excluding the first and
            # last of them (the slice 1 : tokens_len - 1).
            sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))

    # .cpu() is a no-op on CPU tensors and keeps .numpy() valid otherwise.
    return torch.stack(sequence_representations).cpu().numpy()

In [20]:
# ESM-2 features: load the cached arrays unless `train` requests a rebuild,
# in which case the model is loaded, both splits are embedded and saved.
if not train:
    X_train = np.load('data/CPP924/X_train_esm2.npy')
    X_test = np.load('data/CPP924/X_test_esm2.npy')
else:
    model, batch_converter, alphabet = load_esm2_model()
    X_train = get_esm_seq_representation(df_train_cpp924.aa_seq, batch_converter, model, alphabet)
    X_test = get_esm_seq_representation(df_test_cpp924.aa_seq, batch_converter, model, alphabet)

    np.save('data/CPP924/X_train_esm2.npy', X_train)
    np.save('data/CPP924/X_test_esm2.npy', X_test)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((733, 1280), (733,), (183, 1280), (183,))

### SMILES BERT features

In [10]:
import yaml
import torch
import numpy as np
from models.bert import BERT
from datasets.smiles_dataset import SmilesTokenizer
from utils.utils import load_model

def load_smi_bert_model(ckpt='results/pretrain/bak3/model_10_0.070.pt', device='cuda'):
    """
        Build the SMILES BERT encoder, load a checkpoint into it and set eval mode

        Args:
            ckpt: checkpoint path passed to `load_model`.
            device: device passed to `load_model` and returned unchanged.

        Returns:
            (model, tokenizer, device)
    """
    smiles_tokenizer = SmilesTokenizer()
    bert = load_model(
        BERT(smiles_tokenizer, context_length=400, width=512, n_heads=8, n_layers=6),
        ckpt,
        device,
    )
    bert.eval()
    return bert, smiles_tokenizer, device

def get_smi_embd(smi_encoder, smiles, device='cuda'):
    """
        Mean-pool the encoder's token embeddings into one vector per SMILES

        The SMILES are tokenized with `smi_encoder.tokenizer`, moved to `device`,
        passed through `process_batch` and `embed`, and averaged over the
        non-padding positions excluding the first and last of them.

        Returns:
            Tensor with one row per input SMILES.
    """
    tokenizer = smi_encoder.tokenizer
    encoded = tokenizer.batch_encode(smiles).to(device)
    processed = smi_encoder.process_batch(encoded)
    lengths = (processed != tokenizer.pad_token_id).sum(1)
    embeddings = smi_encoder.embed(processed)
    pooled = [
        embeddings[row, 1 : n_tokens - 1].mean(0)
        for row, n_tokens in enumerate(lengths)
    ]
    return torch.stack(pooled)

def encode_smi(smi_list, tokenizer, device, model):
    """
        Encode SMILES strings into pooled embeddings as a NumPy array

        Args:
            smi_list: SMILES strings to encode.
            tokenizer: unused here (`get_smi_embd` reads `model.tokenizer`);
                kept so existing calls keep working.
            device: device the token batch is moved to.
            model: encoder passed to `get_smi_embd`.

        Returns:
            NumPy array with one row per SMILES.
    """
    with torch.no_grad():
        # Honour the caller's device instead of assuming CUDA.
        output = get_smi_embd(model, smi_list, device=device)
        embd = output.cpu().numpy()
    return embd

In [73]:
# Cell-local switch: set to False to reuse the cached SMILES-BERT features.
# Kept separate from the notebook-wide `train` flag so that running this cell
# does not change what the fingerprint and ESM-2 cells do when re-run.
recompute_smi_bert = True
if recompute_smi_bert:
    # Checkpoints tried so far, with the Random Forest metrics each one produced
    # (Acc, Sn, Sp, MCC, AUROC):
    # model, tokenizer, device = load_smi_bert_model(ckpt='results/pretrain/bak3/model_final_0.061.pt')  #79.78	84.69	74.12	0.593	0.794
    # model, tokenizer, device = load_smi_bert_model(ckpt='results/pretrain/bak3/model_10_0.070.pt')  #77.05	82.65	70.59	0.538	0.766
    # model, tokenizer, device = load_smi_bert_model(ckpt='results/pretrain/bak3/model_1_0.093.pt')  #75.41	76.53	74.12	0.506	0.753
    model, tokenizer, device = load_smi_bert_model(ckpt='results/pretrain/bak1/model_3_0.109.pt')  #73.22	76.53	69.41	0.461	0.730

    X_train = encode_smi(df_train_cpp924.smi, tokenizer, device, model)
    X_test = encode_smi(df_test_cpp924.smi, tokenizer, device, model)

    np.save('data/CPP924/X_train_smi_bert.npy', X_train)
    np.save('data/CPP924/X_test_smi_bert.npy', X_test)
else:
    X_train = np.load('data/CPP924/X_train_smi_bert.npy')
    X_test = np.load('data/CPP924/X_test_smi_bert.npy')

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((733, 512), (733,), (183, 512), (183,))

## Classification

In [74]:
# Choose which cached feature sets to concatenate (uncomment one line).
# features = ['fps', ]
# features = ['esm2', ]
features = ['smi_bert', ]
# features = ['molebert', ]
# features = ['esm2', 'fps', ]
# features = ['esm2', 'smi_bert', ]
# features = ['esm2', 'fps', 'smi_bert']

# Every supported feature set is cached as data/CPP924/X_{train,test}_<name>.npy,
# so one name-driven load replaces a separate branch per feature.
SUPPORTED_FEATURES = ('esm2', 'fps', 'smi_bert', 'molebert')

X_train_features = []
X_test_features = []
for feat in features:
    if feat not in SUPPORTED_FEATURES:
        raise ValueError(f'Feature {feat} not supported')
    X_train_features.append(np.load(f'data/CPP924/X_train_{feat}.npy'))
    X_test_features.append(np.load(f'data/CPP924/X_test_{feat}.npy'))

# Stack the selected feature sets column-wise into one matrix per split.
X_train = np.concatenate(X_train_features, axis=1)
X_test = np.concatenate(X_test_features, axis=1)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((733, 512), (733,), (183, 512), (183,))

### Random Forest

In [76]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the reported metrics can be reproduced from run to run;
# an unseeded forest gives different numbers on every fit.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

get_metrics(y_hat, y_test)

# Features      Acc(%) 	 Sn(%) 	 Sp(%) 	 MCC 	 AUROC
# FPS:          86.89	91.84	81.18	0.738	0.865
# ESM:          93.44	95.92	90.59	0.869	0.933 
#               92.90	95.92	89.41	0.858	0.927
#               93.44	93.88	92.94	0.868	0.934*
# SMI:          90.16	91.84	88.24	0.802	0.900
#               90.16	90.82	89.41	0.802	0.901
#               91.80	93.88	89.41	0.835	0.916
#               87.43	92.86	81.18	0.749	0.870
# Mole-BERT:    51.91	45.92	58.82	0.048	0.524


# ESM+FPS:      93.99	94.90	92.94	0.879	0.939
#               91.80	92.86	90.59	0.835	0.917
#               92.90	94.90	90.59	0.857	0.927
#               94.54	96.94	91.76	0.891	0.944*
#               93.44	94.90	91.76	0.868	0.933
# ESM+SMI:      93.44	95.92	90.59	0.869	0.933
#               92.35	94.90	89.41	0.847	0.922
#               93.99	95.92	91.76	0.879	0.938

# ESM+SMI+FPS:  93.99	95.92	91.76	0.879	0.938

Acc(%) 	 Sn(%) 	 Sp(%) 	 MCC 	 AUROC
73.22	76.53	69.41	0.461	0.730
