# Exploratory Data Analysis for ACPPred

In [9]:
!curl -o ../data/raw/positive.txt https://webs.iiitd.edu.in/raghava/anticp2/pos_train_alternate
!curl -o ../data/raw/negative.txt https://webs.iiitd.edu.in/raghava/anticp2/neg_train_alternate

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 17126  100 17126    0     0   8229      0  0:00:02  0:00:02 --:--:--  8257
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 22467  100 22467    0     0   8650      0  0:00:02  0:00:02 --:--:--  8671


In [12]:
from Bio.SeqUtils import ProtParam
import pandas as pd

peptide_data = []

with open('../data/raw/positive.txt') as reader:
    for peptide in reader:
        aa_composition = ProtParam.ProteinAnalysis(peptide).get_amino_acids_percent()
        aa_composition['label'] = 1
        peptide_data.append(aa_composition)
        
with open('../data/raw/negative.txt') as reader:
    for peptide in reader:
        aa_composition = ProtParam.ProteinAnalysis(peptide).get_amino_acids_percent()
        aa_composition['label'] = 0
        peptide_data.append(aa_composition)
        
df_peptides = pd.DataFrame(peptide_data)
df_peptides

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,Y,label
0,0.000000,0.133333,0.000000,0.000000,0.133333,0.066667,0.000000,0.066667,0.133333,0.200000,...,0.000000,0.066667,0.066667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
1,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.166667,0.000000,1
2,0.055556,0.000000,0.000000,0.000000,0.111111,0.111111,0.000000,0.111111,0.055556,0.166667,...,0.000000,0.055556,0.000000,0.000000,0.222222,0.000000,0.055556,0.000000,0.000000,1
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.058824,0.000000,0.235294,0.294118,0.058824,...,0.058824,0.000000,0.000000,0.000000,0.000000,0.000000,0.176471,0.058824,0.000000,1
4,0.115385,0.000000,0.000000,0.000000,0.076923,0.153846,0.153846,0.153846,0.076923,0.076923,...,0.000000,0.000000,0.000000,0.115385,0.000000,0.000000,0.000000,0.038462,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,0.212766,0.085106,0.042553,0.000000,0.000000,0.127660,0.000000,0.021277,0.021277,0.106383,...,0.021277,0.042553,0.021277,0.021277,0.063830,0.042553,0.085106,0.021277,0.000000,0
1548,0.068966,0.000000,0.068966,0.034483,0.068966,0.034483,0.034483,0.034483,0.068966,0.103448,...,0.000000,0.068966,0.068966,0.000000,0.172414,0.103448,0.000000,0.034483,0.000000,0
1549,0.000000,0.000000,0.028571,0.000000,0.000000,0.028571,0.000000,0.028571,0.228571,0.057143,...,0.171429,0.028571,0.000000,0.000000,0.342857,0.000000,0.057143,0.000000,0.000000,0
1550,0.068182,0.022727,0.000000,0.068182,0.022727,0.159091,0.068182,0.022727,0.022727,0.113636,...,0.000000,0.113636,0.045455,0.022727,0.022727,0.068182,0.090909,0.000000,0.045455,0


In [14]:
X = df_peptides.drop(['label'], axis=1)
y = df_peptides['label']

In [16]:
from sklearn.decomposition import PCA
from plotly import express as ex

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

df_peptides_pca = pd.DataFrame(X_pca, columns=['pca_1', 'pca_2'])
df_peptides_pca['label'] = y.astype(str)

ex.scatter(df_peptides_pca, x='pca_1', y='pca_2', color='label')

In [17]:
from sklearn.metrics import silhouette_score

silhouette_score(X_pca, y)

0.13833452938361082

In [18]:
from umap import UMAP

umap = UMAP(n_components=2)
X_umap = umap.fit_transform(X)

df_peptides_umap = pd.DataFrame(X_umap, columns=['umap_1', 'umap_2'])
df_peptides_umap['label'] = y.astype(str)

ex.scatter(df_peptides_umap, x='umap_1', y='umap_2', color='label')

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [19]:
silhouette_score(X_umap, y)

0.23146616

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       195
           1       0.91      0.83      0.87       193

    accuracy                           0.88       388
   macro avg       0.88      0.88      0.88       388
weighted avg       0.88      0.88      0.88       388



In [27]:
def predict_anticancer_peptide(peptide_sequence):
    
    aa_composition = ProtParam.ProteinAnalysis(peptide_sequence).get_amino_acids_percent()
    X = pd.DataFrame([aa_composition])
    return model.predict_proba(X)[0][1]

predict_anticancer_peptide('WWE')

0.29532116901963285