## Topological featurization enables robust classification of protein-peptide interactions

In [1]:
import os
import sys

import warnings
# Silence every UserWarning for the rest of the session so that warnings of
# that category do not clutter the cell outputs.
warnings.simplefilter("ignore", UserWarning)

In [2]:
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, NumberOfPoints, Amplitude
from gtda.plotting import plot_diagram, plot_point_cloud

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics, svm, ensemble, covariance, model_selection

import featurize_pdb
import utils_occ_model

#### Set Paths and Variables

In [3]:
# Project root; all dataset locations below are resolved relative to it.
PATH = './'

# Folder that holds both the peptide index file and the structure folders.
_dataset_dir = os.path.join(PATH, 'Dataset')

# CSV index of peptides (read into df_peptide_list further down).
PATH_PEPTIDE_LIST = os.path.join(_dataset_dir, 'peptide_list.csv')

# Directory containing one sub-folder per <PDB_ID>_<peptide_chain_ID> pair.
PATH_DATA = os.path.join(_dataset_dir, 'pepbdb')

In [4]:
# Accepted values of the 'noncanonical_peptide' column; rows with any other
# value are dropped when the peptide list is filtered.
NON_CANONICAL_PEPTIDE = [0]

# NOTE(review): MIN_RES / MAX_RES are not referenced by any visible cell;
# presumably they are residue-count bounds -- confirm before relying on them.
MIN_RES = 0
MAX_RES = 5

# Number of peptides drawn from the filtered list, and the random seed that
# makes that draw reproducible.
SAMPLE = 100
SEED = 108

#### Sequence Dataset Loading and Filtering

In [5]:
# Load the peptide index and keep only the rows whose 'noncanonical_peptide'
# value is one of the accepted values in NON_CANONICAL_PEPTIDE.
df_peptide_list = pd.read_csv(PATH_PEPTIDE_LIST)
is_accepted = df_peptide_list['noncanonical_peptide'].isin(NON_CANONICAL_PEPTIDE)
df_peptide_list = df_peptide_list.loc[is_accepted]

In [6]:
# Draw a reproducible random subset of SAMPLE rows (seeded with SEED), then
# renumber the rows 0..SAMPLE-1, discarding the original index.
sampled_rows = df_peptide_list.sample(n=SAMPLE, random_state=SEED)
df_peptide_list = sampled_rows.reset_index(drop=True)

#### Sequence Featurization

In [7]:
# Single featurizer instance shared by the cells below, which call its
# featurize() method with a folder, a PDB file name and a feature-type name.
featurize = featurize_pdb.Featurize_PDB()

Featurize PDB as Coulomb Matrix, Point Cloud, and Protein Parameters


In [8]:
# Feature-type names requested from the featurizer for every peptide.pdb file.
# NOTE(review): 'PC' presumably stands for point cloud and 'PC-Mass' for a
# mass-aware variant of it -- confirm against featurize_pdb.
peptide_feature_types = [
    'Coulomb',
    'PC',
    'PC-Mass',
]

In [9]:
%%capture
# Per-structure features. Entries at the same position in every list below
# belong to the same <PDB_ID>_<peptide_chain_ID> structure.
receptor_features = []
peptide_param_features = []
peptide_raw_features = {feature: [] for feature in peptide_feature_types}

# Structures that could not be featurized, kept as (pdb_chain_id, error text).
# This cell's output is captured, so this list is the only record of skips.
failed_pdb_chain_ids = []

for pdb_id, chain_id in zip(df_peptide_list['PDB_ID'],
                            df_peptide_list['peptide_chain_ID']):
    # Every structure lives in <PATH_DATA>/<PDB_ID>_<peptide_chain_ID>/.
    pdb_chain_id = ''.join((pdb_id, '_', chain_id))
    path_folder = os.path.join(PATH_DATA, pdb_chain_id)

    try:
        tmp_peptide_feature = {
            feature: featurize.featurize(path_folder, 'peptide.pdb', feature)
            for feature in peptide_raw_features
        }
        tmp_receptor_feature = featurize.featurize(
            path_folder, 'receptor.pdb', 'Parameters')
        tmp_peptide_param_feature = featurize.featurize(
            path_folder, 'peptide.pdb', 'Parameters')
    except Exception as err:
        # Best effort: skip a structure when any of its featurizations fails.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so the loop can still be interrupted.
        failed_pdb_chain_ids.append((pdb_chain_id, repr(err)))
        continue

    # Append only after every featurization succeeded, so that all feature
    # lists stay aligned index-by-index.
    for feature in peptide_raw_features:
        peptide_raw_features[feature].append(tmp_peptide_feature[feature])
    receptor_features.append(tmp_receptor_feature)
    peptide_param_features.append(tmp_peptide_param_feature)

In [10]:
# For every peptide feature type, pass the receptor features together with
# that type's raw peptide features through utils_occ_model.filter_dataset and
# store the two returned collections under the feature-type name.
# NOTE(review): the filtering criterion lives in utils_occ_model -- not
# visible from this notebook.
dict_receptor_features = {}
dict_peptide_features = {}

for feature, raw_peptide_features in peptide_raw_features.items():
    filtered_pair = utils_occ_model.filter_dataset(
        receptor_features, raw_peptide_features)
    dict_receptor_features[feature], dict_peptide_features[feature] = filtered_pair

#### TDA Featurization for peptides

In [None]:
# Vietoris-Rips persistence diagrams (homology dimensions 0, 1 and 2) of the
# filtered peptide features, keyed by feature type.
persistence_diagrams = {}

for feature in peptide_feature_types:
    # 'Coulomb' inputs are handed over as precomputed matrices; every other
    # feature type is treated as coordinates under the Euclidean metric.
    # Compare with `==`: `is` tests object identity, which only happens to
    # work for interned string literals and raises a SyntaxWarning on 3.8+.
    metric = 'precomputed' if feature == 'Coulomb' else 'euclidean'
    VR = VietorisRipsPersistence(metric=metric, homology_dimensions=[0, 1, 2], n_jobs=-1)
    persistence_diagrams[feature] = VR.fit_transform(dict_peptide_features[feature])

In [None]:
# Persistence entropy of every diagram, keyed by feature type. The
# transformer is re-fitted on each feature type's diagrams by fit_transform.
PE = PersistenceEntropy()
persistence_entropy = {
    feature: PE.fit_transform(persistence_diagrams[feature])
    for feature in peptide_feature_types
}

In [None]:
# NumberOfPoints descriptor of every diagram, keyed by feature type.
# fit_transform already processes the whole collection of diagrams in one
# call, so no per-diagram loop is needed (the previous inner loop recomputed
# the identical result once per diagram).
number_of_points = {}
for feature in peptide_feature_types:
    number_of_points[feature] = NumberOfPoints().fit_transform(
        persistence_diagrams[feature])

In [None]:
# Wasserstein amplitude of every diagram, keyed by feature type.
# fit_transform already processes the whole collection of diagrams in one
# call, so no per-diagram loop is needed (the previous inner loop recomputed
# the identical result once per diagram).
amplitude = {}
for feature in peptide_feature_types:
    amplitude[feature] = Amplitude(metric='wasserstein').fit_transform(
        persistence_diagrams[feature])

#### Train, Test Splits

In [None]:
# Train / test feature sets per peptide feature type. Each one is built from
# four inputs: persistence entropy, filtered receptor features, number of
# points and amplitude.
X_train = {}
X_test = {}

for feature in peptide_feature_types:
    feature_groups = [
        persistence_entropy[feature],
        dict_receptor_features[feature],
        number_of_points[feature],
        amplitude[feature],
    ]
    # occ_feature_preprocess returns four values; only the first two (the
    # train and test features) are kept here.
    train_part, test_part, _, _ = utils_occ_model.occ_feature_preprocess(
        X=feature_groups)
    X_train[feature] = train_part
    X_test[feature] = test_part

In [None]:
# Split built only from the 'Parameters' features of receptor and peptide,
# without any topological descriptors. Only the first two of the four
# returned values are kept.
param_feature_groups = [receptor_features, peptide_param_features]
X_train_params, X_test_params, _, _ = utils_occ_model.occ_feature_preprocess(
    X=param_feature_groups)

#### Model training

In [None]:
# Run occ_training with the 'svm' option on the parameter-only training
# features (receptor + peptide 'Parameters', no topological descriptors).
utils_occ_model.occ_training(X_train_params, 'svm')

In [None]:
# Run occ_training with the 'isoforest' option on the parameter-only training
# features (receptor + peptide 'Parameters', no topological descriptors).
utils_occ_model.occ_training(X_train_params, 'isoforest')

In [None]:
# Run occ_training with the 'svm' option on the training features derived
# from the 'PC' peptide feature type.
utils_occ_model.occ_training(X_train['PC'], 'svm')

In [None]:
# Run occ_training with the 'isoforest' option on the training features
# derived from the 'PC' peptide feature type.
utils_occ_model.occ_training(X_train['PC'], 'isoforest')

In [None]:
# Run occ_training with the 'svm' option on the training features derived
# from the 'Coulomb' peptide feature type.
utils_occ_model.occ_training(X_train['Coulomb'], 'svm')

In [None]:
# Run occ_training with the 'isoforest' option on the training features
# derived from the 'Coulomb' peptide feature type.
utils_occ_model.occ_training(X_train['Coulomb'], 'isoforest')

In [None]:
# Run occ_training with the 'svm' option on the training features derived
# from the 'PC-Mass' peptide feature type.
utils_occ_model.occ_training(X_train['PC-Mass'], 'svm')

In [None]:
# Run occ_training with the 'isoforest' option on the training features
# derived from the 'PC-Mass' peptide feature type.
utils_occ_model.occ_training(X_train['PC-Mass'], 'isoforest')