# **Beta-Lactamase Data**

Open Bioinformatics Research Project

Chanin Nantasenamat

For a full explanation of the first open bioinformatics research project, see this video on the Data Professor channel.

[Data Professor YouTube channel](https://youtube.com/dataprofessor)

In [None]:
! wget https://github.com/dataprofessor/beta-lactamase/raw/main/beta_lactamase_CHEMBL29.zip

In [None]:
! unzip beta_lactamase_CHEMBL29.zip

In [None]:
# Count the CSV files in the current working directory (one line per file from ls).
! ls *.csv | wc -l

In [None]:
import zipfile
import pandas as pd

# Read every CSV member of the archive and stack them into one DataFrame.
# The context managers close the archive and each member handle once read.
frames = []
with zipfile.ZipFile("beta_lactamase_CHEMBL29.zip", "r") as zf:
    for member_name in zf.namelist():
        # Skip anything in the archive that is not a CSV file (e.g. directory entries).
        if not member_name.lower().endswith(".csv"):
            continue
        with zf.open(member_name) as member:
            frames.append(pd.read_csv(member))

# ignore_index=True gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Display the combined DataFrame built from the archive's CSV files.
df

In [None]:
# Bar plot of missing vs. non-missing pChEMBL values

import matplotlib.pyplot as plt

# Count the rows with and without a pChEMBL value
missing = df.pchembl_value.isnull().sum()
nonmissing = df.pchembl_value.notnull().sum()

x = ['Missing', 'Non-Missing']
y = [missing, nonmissing]

# Setup plot
fig, ax = plt.subplots(figsize=(5, 4.5))

# Make bar plot
p = ax.bar(x, y, color=['#F8766D', '#00BFC4'], ec='black')

ax.set_title('pChEMBL Missing Data', fontsize=14, fontweight='bold', pad=15)

# Derive the y-range from the data (instead of a fixed upper limit) so the
# tallest bar and its label always fit, whatever the counts are.
y_max = max(max(y), 1)
ax.set_ylim(0, y_max * 1.15)
ax.tick_params(axis='both', labelsize=14)

# Label above each bar, centred on the bar regardless of the number of digits
for index, data in enumerate(y):
    ax.text(index, data + 0.02 * y_max, f"{data}",
            ha='center', va='bottom', fontsize=14)

plt.show()

In [None]:
# Non-missing data with pChEMBL value.
# .copy() makes df2 an independent DataFrame rather than a slice of df, so the
# column added to df2 later in the notebook does not raise SettingWithCopyWarning.
df2 = df[df.pchembl_value.notnull()].copy()
df2

In [None]:
import numpy as np

# Base-10 logarithm of every standard_value in the unfiltered DataFrame df.
# np.log10 yields -inf for 0 and NaN for negative inputs, so any such rows
# show up as -inf/NaN in the printed result.
y = np.log10(df["standard_value"])

print(y)

In [None]:
# Per-compound mean of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where averaging the text columns raises a TypeError instead of
# silently dropping them.
df2.groupby('molecule_chembl_id').mean(numeric_only=True)

In [None]:
# Per-compound standard deviation of the numeric columns. numeric_only=True is
# required on pandas >= 2.0, where the text columns otherwise raise an error.
df2.groupby('molecule_chembl_id').std(numeric_only=True)

In [None]:
# Per-compound standard deviation of the numeric columns.
df3 = df2.groupby('molecule_chembl_id').std(numeric_only=True)

# The previous filter combined `< 2` with `== pd.NA` using `&`; a missing value
# never compares equal to anything and is never < 2, so it always returned an
# empty frame. Missing values must be detected with isna().
# NOTE(review): assumes the intent is "spread below 2 OR spread undefined";
# confirm, and switch `|` back to `&`/notna() if only defined spreads are wanted.
pchembl_std = df3.pchembl_value
df3[(pchembl_std < 2) | pchembl_std.isna()]

In [None]:
import numpy as np

# Per-compound standard deviation of the numeric columns.
df3 = df2.groupby('molecule_chembl_id').std(numeric_only=True)

# NaN never compares equal to anything (not even itself), so `== np.nan` always
# selects zero rows; isna() is the correct test. The sample standard deviation
# is NaN for groups with fewer than two pChEMBL values.
df3[df3.pchembl_value.isna()]

In [None]:
# Summary counts for the molecule_chembl_id column of the full DataFrame.
chembl_ids = df.molecule_chembl_id
print('Number of unique ChEMBL ID:', len(chembl_ids.unique()))
print('Total number of ChEMBL ID: ', len(df))
print('Number of missing ChEMBL ID: ', chembl_ids.isnull().sum())

In [None]:
import matplotlib.pyplot as plt

# Number of distinct ChEMBL IDs, and the number of remaining rows beyond them.
unique = len(df.molecule_chembl_id.unique())
not_unique = len(df) - unique

x = ['Unique', 'Redundant']
y = [unique, not_unique]

# Bar chart comparing the unique and the redundant counts.
unique_fig, unique_ax = plt.subplots(figsize=(2, 3))
unique_ax.bar(x, y, color='#00BFC4', width=0.4, ec='black')
unique_ax.set_title("Comparing the unique and the redundant data")

plt.show()


In [None]:
# Bar chart of the first 50 entries of the target-name frequency table
# (value_counts orders entries from most to least frequent).
top_targets = df2['target_pref_name'].value_counts().iloc[:50]
targets_ax = top_targets.plot.bar(figsize=(24, 4), color='#00BFC4', ec='black')

targets_ax.set_title('Top 50 Targets', fontsize=14, fontweight='black', pad=15)
plt.show()

In [None]:
# Bar chart of the 10 most frequent values in the standard_type column.
top_standard_types = df2['standard_type'].value_counts().iloc[:10]
types_ax = top_standard_types.plot.bar(figsize=(8, 4), color='#00BFC4', ec='black')

types_ax.set_title('Top Bioactivity Units', fontsize=14, fontweight='black', pad=15)
plt.show()

In [None]:
# Frequency of each BioAssay Ontology label among the rows in df2.
bao_labels = df2['bao_label'].value_counts()
bao_ax = bao_labels.plot.bar(figsize=(8, 4), color='#00BFC4', ec='black')

bao_ax.set_title('Histogram of BioAssay Ontology', fontsize=14, fontweight='black', pad=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Display the BioAssay Ontology label counts as a table.
bao_labels

In [None]:
# Distribution of pChEMBL values in df2, drawn with 40 bins.
hist_ax = df2['pchembl_value'].hist(bins=40, figsize=(8, 4), color='#00BFC4', ec='black')

hist_ax.set_title('Histogram of pChEMBL values', fontsize=14, fontweight='black', pad=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Display df2 (the rows of df that have a pChEMBL value).
df2

In [None]:
# Label each row by potency: pChEMBL <= 6 -> 'Inactive', pChEMBL > 6 -> 'Active'.
# One boolean condition per label, in the same order as `values`.
conditions = [(df2['pchembl_value'] <= 6), (df2['pchembl_value'] > 6)]
values = ['Inactive', 'Active']

# np.select fills rows that match no condition with `default`. Its implicit
# default is the integer 0, which recent NumPy versions refuse to combine with
# string choices (TypeError: no common dtype), so a string default is passed.
# assign() returns a new DataFrame, which avoids SettingWithCopyWarning when
# df2 is a slice of df and keeps this cell safe to re-run.
df2 = df2.assign(Activity=np.select(conditions, values, default='Unknown'))

# display updated DataFrame
df2.head()

**Install padelpy**

In [None]:
! pip install padelpy

# **Prepare fingerprint XML**

**Download fingerprint XML files**

In [None]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

**List and sort fingerprint XML files**

In [None]:
import glob

# Every .xml file in the working directory, in sorted name order.
xml_files = sorted(glob.glob("*.xml"))
xml_files

In [None]:
# Short names for the fingerprint types. The notebook pairs this list with the
# sorted xml_files list by position (via zip), so the order of these names
# must match the sorted order of the XML file names.
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

**Create a dictionary**

In [None]:
# FP_list and xml_files are paired by position, and zip() silently stops at the
# shorter input. Fail loudly if the counts differ (e.g. a stray or missing .xml
# file in the directory) instead of building a truncated or shifted mapping.
if len(FP_list) != len(xml_files):
    raise ValueError(
        f"Expected {len(FP_list)} fingerprint XML files but found "
        f"{len(xml_files)}: {xml_files}"
    )

# Map each fingerprint short name to its XML descriptor file.
fp = dict(zip(FP_list, xml_files))
fp

# **Prepare data subset as input to PaDEL**

In [None]:
# Display df2, the DataFrame from which the PaDEL input columns are taken.
df2

In [None]:
# Two-column table (SMILES string, then ChEMBL ID) taken from df2.
smiles_columns = [df2['canonical_smiles'], df2['molecule_chembl_id']]
df3 = pd.concat(smiles_columns, axis=1)

# Write it tab-separated with neither header row nor index column.
df3.to_csv('molecule.smi', sep='\t', index=False, header=False)
df3

# **Calculate descriptors**
PaDEL provides 12 fingerprint types. To calculate a different one, set the `fingerprint` variable in the cell below to any key of the `fp` dictionary shown above; the corresponding XML file (e.g. SubstructureFingerprintCount.xml) is then passed as the `descriptortypes` input argument. To calculate all 12, repeat the calculation once per key.

In [None]:
# Display the fingerprint-name -> XML-file dictionary.
fp

In [None]:
# Look up the XML file mapped to the 'PubChem' key.
fp['PubChem']

In [None]:
from padelpy import padeldescriptor

# Fingerprint to compute; must be one of the keys of the `fp` dictionary.
fingerprint = 'CDK'

# Output CSV is named after the fingerprint; the XML file comes from `fp`.
fingerprint_output_file = fingerprint + '.csv'
fingerprint_descriptortypes = fp[fingerprint]

# Options passed through to padeldescriptor unchanged.
padel_options = dict(
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    removesalt=True,
    log=True,
    fingerprints=True,
)

# NOTE(review): later cells pair the output rows with df2 by position —
# confirm the output keeps the input order when threads > 1.
padeldescriptor(
    mol_dir='molecule.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    **padel_options,
)

In [None]:
from padelpy import from_smiles

# Single-molecule example: request fingerprints only, no descriptors.
example_smiles = 'C[C@]1(Cn2ccnn2)[C@H](C(=O)O)N2C(=O)C[C@H]2S1(=O)=O'
fingerprints = from_smiles(example_smiles, fingerprints=True, descriptors=False)


# **Display calculated fingerprints**




In [None]:
# Load the CSV written by the descriptor calculation and display it.
descriptors = pd.read_csv(fingerprint_output_file)
descriptors

# **Build Multiple Machine Learning Models**

In [None]:
# Display df2, which supplies the Activity labels used as the target below.
df2

In [None]:
# Feature matrix: every column of the descriptor table except 'Name'.
X = descriptors.drop('Name', axis=1)
# Target: the Activity column of df2.
# NOTE(review): X and y are paired by row position only — presumably the
# descriptor rows are in the same order and number as df2; confirm before modelling.
y = df2['Activity']

In [None]:
# Display the feature matrix.
X

In [None]:
# Call count() to get the number of non-missing labels; without the
# parentheses the cell only shows the bound method object.
y.count()

**Remove low variance features**

In [None]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    """Return only the columns of ``input_data`` that pass a variance filter.

    A scikit-learn ``VarianceThreshold`` selector built with ``threshold`` is
    fitted on ``input_data``; the columns it reports as supported are returned.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Feature table, one column per feature.
    threshold : float, default 0.1
        Value handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``input_data`` restricted to the selected columns.
    """
    # fit() returns the fitted selector itself, so it can be chained.
    selector = VarianceThreshold(threshold).fit(input_data)
    kept_positions = selector.get_support(indices=True)
    kept_columns = input_data.columns[kept_positions]
    return input_data[kept_columns]

# Replace X with its variance-filtered version (threshold 0.1) and display it.
X = remove_low_variance(X, threshold=0.1)
X