In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import rdMolDescriptors

In [None]:
# Read the training, holdout and peptide tables from the working directory.
# Note that the holdout table is bound to the name `testdf`.
traindf, testdf, peptidedf = (
    pd.read_csv(csv_name)
    for csv_name in ("train-final.csv", "holdout-final.csv", "peptide-final.csv")
)

In [None]:
def FPBitVec( fps ):
    """Decode base64-serialized fingerprints into RDKit bit vectors.

    Parameters
    ----------
    fps : iterable of str
        Base64 strings, one per fingerprint (e.g. a DataFrame column).

    Returns
    -------
    list
        One ``DataStructs.ExplicitBitVect`` per input entry, in input order.
    """
    def _decode(encoded):
        # Allocate a 4096-bit vector, then populate it from the base64 text.
        vec = DataStructs.ExplicitBitVect(4096)
        vec.FromBase64(encoded)
        return vec

    return [_decode(encoded) for encoded in fps]

# Decode the fingerprints stored in the ECFP6 column of each table.
xtrain = FPBitVec(traindf.ECFP6)
# The holdout table is loaded as `testdf`; `holdoutdf` is never defined and
# raised a NameError on a fresh kernel.
xtest = FPBitVec(testdf.ECFP6)
xpeptide = FPBitVec(peptidedf.ECFP6)

# For each holdout / peptide fingerprint, keep the highest Tanimoto similarity
# to any training fingerprint (i.e. similarity to its nearest training neighbour).
train_test_max_similarity = [max(DataStructs.BulkTanimotoSimilarity(fps,xtrain)) for fps in xtest]
train_peptide_max_similarity = [max(DataStructs.BulkTanimotoSimilarity(fps,xtrain)) for fps in xpeptide]

In [None]:
# One histogram per comparison, drawn with identical axes so they can be
# compared side by side: first holdout-vs-train, then peptide-vs-train.
for max_similarities in (train_test_max_similarity, train_peptide_max_similarity):
    plt.hist(max_similarities, bins=50)
    plt.xlabel("Similarity", size=15)
    plt.ylabel("Count", size=15)
    plt.xticks([0, .2, .4, .6, .8, 1])
    plt.ylim(0, 600)
    plt.show()

In [None]:
# Number of rotatable bonds in the training, holdout and peptide sets.
# For each set: parse the SMILES column into molecules, count rotatable bonds
# per molecule (non-strict definition), then tally how often each count occurs.

# Training set
trainmol = list(map(Chem.MolFromSmiles, traindf.SMILES))
train_rotor = [rdMolDescriptors.CalcNumRotatableBonds(m, strict=False) for m in trainmol]
trainv, trainc = np.unique(train_rotor, return_counts=True)

# Holdout set
testmol = list(map(Chem.MolFromSmiles, testdf.SMILES))
test_rotor = [rdMolDescriptors.CalcNumRotatableBonds(m, strict=False) for m in testmol]
testv, testc = np.unique(test_rotor, return_counts=True)

# Peptide set
peptidemol = list(map(Chem.MolFromSmiles, peptidedf.SMILES))
peptide_rotor = [rdMolDescriptors.CalcNumRotatableBonds(m, strict=False) for m in peptidemol]
pepv, pepc = np.unique(peptide_rotor, return_counts=True)

# Bar chart of rotatable-bond frequencies for each set, in the order
# training, holdout, peptide. All three share the same tick layout and labels.
for bond_values, bond_counts in ((trainv, trainc), (testv, testc), (pepv, pepc)):
    plt.bar(bond_values, bond_counts)
    plt.xticks(list(range(0, 20, 2)))
    plt.ylabel("Count", size=15)
    plt.xlabel("Number of Rotatable Bonds", size=15)
    plt.show()