<a href="https://colab.research.google.com/github/eh329/Bioinformatcs-Project/blob/main/DrugDiscoverywithPython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Computational Drug Discovery 

### Using ChEMBL Database

In [None]:
# Installing libraries

!pip install chembl_webresource_client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Importing libraries

import pandas as pd

from chembl_webresource_client.new_client import new_client

## 1 - Preprocessing

In [None]:
# Selection of active compound
# Target search for active compound

def target_search(act_com):
  """
  Searches ChEMBL database for target protein(s)

  Parameters
  -----------
  act_com: String, search term (name of the active compound) whose
           potential targets the user is looking for

  Returns
  -----------
  Pandas dataframe of targets, or the string "No target found!" if the
  search returns no rows
  """
  targets = pd.DataFrame(new_client.target.search(act_com))

  if len(targets) == 0:
    # The message string (rather than None or an exception) is the value
    # this function has always returned for an empty search; it is kept so
    # existing callers keep working.
    return "No target found!"

  return targets

# Select and retrieve the bioactivity for a specific entry

def standard_type(target_index):
  """
  Looks up the standard activity type(s) recorded in ChEMBL for a
  selected target from target_search function.

  Parameters
  -----------
  target_index: Int, number of index for the specific target
                in the target_search function result (read from the
                module-level `target_res` dataframe).

  Returns
  -----------
  A list, if more than one type (or no type at all) is found,
  or a string in case of exactly one
  """
  selected_target = target_res.target_chembl_id[target_index]

  # Restrict the query to the selected target. Indexing the unfiltered
  # activity resource would return a record unrelated to this target.
  # only() asks the service for just the field that is needed here.
  records = new_client.activity.filter(
      target_chembl_id = selected_target).only(["standard_type"])

  # dict.fromkeys removes duplicates while keeping first-seen order.
  types = list(dict.fromkeys(
      rec["standard_type"] for rec in records
      if rec.get("standard_type") is not None))

  if len(types) == 1:
    return types[0]

  return types


def bioactivity(target_index, standard = "IC50", save = False):
  """
  Retrieves the activity records of a selected target from
  target_search function.

  Parameters
  -----------
  target_index: Int, number of index for the specific target
                in the target_search function result (read from the
                module-level `target_res` dataframe).

  standard: String, the result from standard_type function,
            IC50 by default.
  save: Boolean, if True, saves a csv file ("bioactivity_data.csv")
        in the current directory and returns nothing.
        If False, returns the dataframe result. False by default.

  Returns
  -----------
  Pandas dataframe of activities for the specific target from ChEMBL
  when save is False, None when the data is written to disk instead
  """
  selected_target = target_res.target_chembl_id[target_index]
  activity = new_client.activity
  res = activity.filter(target_chembl_id = selected_target).filter(standard_type = standard)
  df = pd.DataFrame(res)

  if not save:
    return df

  # DataFrame has no `to_save` method; `to_csv` is the CSV writer.
  # index = False keeps the row index out of the file.
  df.to_csv("bioactivity_data.csv", index = False)


# Labeling compounds into three different classes:
#                                  active: compounds with a value of 1000 nM or less
#                                  inactive: compounds with a value of 10000 nM or more
#                                  intermediate: compounds with a value between 1000 and 10000 nM
# This is helpful for preparing the ML model later.

def bioactivity_class(num):
  """
  Assigns a bioactivity label to a single value

  Parameters
  -----------
  num: a number or numeric string (anything float() accepts),
       typically one cell of a Pandas series

  Returns
  -----------
  String: "inactive" for values of 10000 or more, "active" for values
  of 1000 or less, "intermediate" for everything else

  """
  # Convert once; the thresholds below are compared against this float.
  value = float(num)

  if value >= 10000:
    return "inactive"

  if value <= 1000:
    return "active"

  return "intermediate"
  

In [None]:
# Search ChEMBL for targets matching "coronavirus". The result is stored in
# the module-level name `target_res`, which standard_type() and bioactivity()
# read to resolve a target index into a ChEMBL target id.
target_res = target_search("coronavirus")
# Preview the first rows of the search result.
target_res.head()

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,15.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859


In [None]:
# Fetch the activity records for the target at index 4 of `target_res`,
# using bioactivity()'s default standard type ("IC50").
data = bioactivity(4)
# Preview the first rows of the activity table.
data.head()

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0


In [None]:
# Handling missing values
# Labeling
# Creating a subset

# Keep only rows that have a standard_value. .copy() makes the filtered
# frame independent of the original, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
data = data[data["standard_value"].notna()].copy()
data["bioactivity_class"] = data["standard_value"].apply(bioactivity_class)
# Independent copy of the four columns used downstream.
final_df = data[["molecule_chembl_id", "canonical_smiles",
                 "bioactivity_class", "standard_value"]].copy()