In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
! pip install pandas
! pip install numpy
! pip install matplotlib
! pip install pandas-summary
! pip install tensorflow
#! pip install -q tensorflow-model-optimization
! pip install tensorflow-addons
! pip install sklearn-pandas
! pip install mlflow



In [6]:
! pip install chembl-webresource-client
! pip install deepchem
! pip install seaborn



# Proyecto final Bioinformática (NLP)

Mirar el siguiente video, sobre todo la primera parte, para tener contexto sobre el descubrimiento de drogas:

https://www.youtube.com/watch?v=jBlTQjcKuaY

Resumen y puntos importantes del video:
- Queremos entender la bioactividad de una molécula (molecule_chembl_id) sobre una enzima (Acetylcholinesterase)
- La bioactividad se medirá en este caso con el IC50 (standard_value)
- A menor IC50, menos droga para generar la misma actividad, es decir, mayor actividad relativa
- La notación de la fórmula química se llama smiles (https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system)
- Existen distintas técnicas para obtener features de las moléculas y en el video se describen 2:
    - Descriptores de Lipinski
    - Fingerprints del tipo pubchem
- Se construye un modelo de regresión con RandomForest para estimar el pIC50 (IC50 en escala logarítmica) dados los fingerprints de entrada

### Objetivos del proyecto:
- Evaluar distintas alternativas de modelos de deep learning para resolver el problema
    - LSTM
    - CNN
    - TextCNN
- En vez de ingresar con los features de entrada (fingerprints) como en el video, usar técnicas de embeddings aplicadas en NLP
    - Modo carácter, dado el smiles de la fórmula química
    - Utilizando un tokenizer sobre los smiles
- La salida será el pIC50

### Librerías:
- chembl-webresource-client: Para bajar el dataset (https://pypi.org/project/chembl-webresource-client/)
- deepchem: librería muy interesante con muchas implementaciones de deep learning aplicadas a la química (https://github.com/deepchem/deepchem)

In [7]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

# Cliente API

En esta notebook solo se baja el dataset. No tiene que hacer nada más que ejecutarla y entenderla

Librería para bajar el dataset

In [8]:
# Handle to the "target" resource of the ChEMBL web-service client.
target = new_client.target
# Search the targets for entries matching the enzyme name.
target_query = target.search('acetylcholinesterase')
# Load the search results into a DataFrame so they can be inspected.
targets = pd.DataFrame.from_dict(target_query)

In [9]:
# Preview the first rows of the search results.
targets.head()

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P22303', 'xref_name': None, 'xre...",Homo sapiens,Acetylcholinesterase,27.0,False,CHEMBL220,"[{'accession': 'P22303', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Cholinesterases; ACHE & BCHE,27.0,False,CHEMBL2095233,"[{'accession': 'P06276', 'component_descriptio...",SELECTIVITY GROUP,9606
2,[],Drosophila melanogaster,Acetylcholinesterase,18.0,False,CHEMBL2242744,"[{'accession': 'P07140', 'component_descriptio...",SINGLE PROTEIN,7227
3,[],Bemisia tabaci,AChE2,16.0,False,CHEMBL2366409,"[{'accession': 'B3SST5', 'component_descriptio...",SINGLE PROTEIN,7038
4,[],Leptinotarsa decemlineata,Acetylcholinesterase,16.0,False,CHEMBL2366490,"[{'accession': 'Q27677', 'component_descriptio...",SINGLE PROTEIN,7539


In [10]:
# Take the ChEMBL id stored under index label 0 of the search results.
# NOTE(review): this relies on the ordering returned by the search; in the run
# recorded in this notebook label 0 was the human enzyme — confirm on re-run.
selected_target = targets.target_chembl_id[0]
selected_target
# Value in the recorded run: 'CHEMBL220'

'CHEMBL220'

In [11]:
# Handle to the "activity" resource of the ChEMBL web-service client.
activity = new_client.activity
# Restrict to activity records of the selected target whose standard type is IC50.
# NOTE(review): presumably the query is only evaluated when it is iterated —
# confirm against the client documentation.
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

# Bajada de data
Puede tardar un poco, dependiendo de qué tan saturado esté el servidor.

Por eso el for: para ver el progreso y bajar la ansiedad. Son del orden de 8.400 registros.

In [12]:
# Collect the query results into a list, one record at a time.
res_cols = []
for i, r in enumerate(res):
    # Progress display: print the running record index and return the cursor
    # to the start of the line ('\r') so the next index overwrites it.
    print(f'{i}\r', end='')
    res_cols.append(r)

8394

In [13]:
# Build a DataFrame from the list of downloaded records.
df = pd.DataFrame(res_cols)

In [14]:
# Display the downloaded table (pandas truncates the rendering of large frames).
df

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8390,,23311841,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4844259,Inhibition of human recombinant AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,1.07
8391,Not Determined,23311842,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4844259,Inhibition of human recombinant AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,,,,
8392,,23311843,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4844259,Inhibition of human recombinant AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,26.8
8393,Not Determined,23311844,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4844259,Inhibition of human recombinant AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,,,,


# Limpieza de datos

In [15]:
# Discard records that lack a measured value or a SMILES string, then keep a
# single record (the first one encountered) per distinct SMILES.
df = (
    df
    .dropna(subset=['standard_value', 'canonical_smiles'])
    .drop_duplicates(['canonical_smiles'])
)

In [16]:
# Keep only the molecule id, its SMILES string and the measured value.
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
# Take an explicit copy: later cells add columns to `df`, and assigning into a
# frame obtained by indexing another frame triggers SettingWithCopyWarning.
df = df[selection].copy()

In [17]:
# Display the reduced table (molecule id, SMILES, standard_value).
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0
...,...,...,...
8384,CHEMBL4859103,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCCNC(=O)c4cc(...,2.57
8385,CHEMBL4863615,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)Nc4cn[n...,2.39
8386,CHEMBL4854913,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,0.41
8387,CHEMBL4848527,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)NCc4cc[...,0.63


# Preprocesamiento y normalización

In [18]:
import numpy as np

In [19]:
# Convert `standard_value` to a numeric dtype with a single vectorized call
# instead of invoking pd.to_numeric once per element through .apply.
# `assign` returns a new frame, so nothing is written into a frame that may be
# a slice of an earlier one (avoids SettingWithCopyWarning).
df = df.assign(standard_value=pd.to_numeric(df['standard_value']))

In [20]:
# The reference video applies this normalization; the notebook author does not
# consider it strictly necessary.
# Cap values at 1e8. `clip` replaces the boolean-arithmetic lambda
# `(x>1e8)*1e8 + (x<=1e8)*x`, which produced NaN for an infinite input (0*inf)
# instead of the cap; results for finite values are unchanged.
# Note: the pIC50 cell reads `standard_value`, not this capped column.
df = df.assign(standard_value_norm=df['standard_value'].clip(upper=1e8))

In [21]:
# pIC50 = -log10(standard_value * 1e-9), computed on the whole column at once
# instead of element by element through .apply (same arithmetic, same results).
# A standard_value of 0 yields +inf here; such rows are removed further down.
# `assign` returns a new frame rather than writing into a possible slice.
df = df.assign(pIC50=-np.log10(df['standard_value'] * (10**-9)))

In [22]:
# Display the table with the added standard_value_norm and pIC50 columns.
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_value_norm,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.00,750.00,6.124939
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.00,100.00,7.000000
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.00,50000.00,4.301030
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.00,300.00,6.522879
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.00,800.00,6.096910
...,...,...,...,...,...
8384,CHEMBL4859103,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCCNC(=O)c4cc(...,2.57,2.57,8.590067
8385,CHEMBL4863615,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)Nc4cn[n...,2.39,2.39,8.621602
8386,CHEMBL4854913,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,0.41,0.41,9.387216
8387,CHEMBL4848527,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)NCc4cc[...,0.63,0.63,9.200659


# Más análisis

In [23]:
# NOTE(review): the describe() result is discarded — only the last expression
# of a cell is displayed. Wrap it in display(...) or give it its own cell.
df['pIC50'].describe()
# Largest pIC50 and where it occurs. argmax() gives the integer position of
# the maximum, not its index label; idxmax() would give the label.
df['pIC50'].max() ,df['pIC50'].argmax()

(inf, 5462)

In [24]:
# NOTE(review): neither line changes `df` — the isna() result is discarded and
# the dropna() result is only displayed, never assigned.
df['pIC50'].isna()
df['pIC50'].dropna()

0       6.124939
1       7.000000
2       4.301030
3       6.522879
4       6.096910
          ...   
8384    8.590067
8385    8.621602
8386    9.387216
8387    9.200659
8388    8.903090
Name: pIC50, Length: 5824, dtype: float64

In [25]:
# Re-check the maximum; an infinite pIC50 corresponds to standard_value == 0
# (log10 of zero).
df['pIC50'].max() ,df['pIC50'].argmax()

(inf, 5462)

In [26]:
# Remove the rows whose pIC50 is infinite, keeping an independent copy.
is_infinite = np.isinf(df.pIC50)
df = df[~is_infinite].copy()

# Guardar archivo

In [27]:
# Create the output directory from Python instead of shelling out to `mkdir`:
# exist_ok=True makes the cell safe to re-run when the directory already
# exists, and pathlib does not depend on which shell is available.
from pathlib import Path

Path('data').mkdir(parents=True, exist_ok=True)

In [28]:
# Write the preprocessed table to CSV under data/, omitting the index column.
df.to_csv('data/acetylcholinesterase_02_bioactivity_data_preprocessed.csv', index=False)

In [29]:
from datetime import datetime

# Capture the current local date and time when this cell runs.
now = datetime.now()
# Render it as dd/mm/YYYY HH:MM:SS through the datetime format spec.
dt_string = f"{now:%d/%m/%Y %H:%M:%S}"
print("date and time =", dt_string)

date and time = 01/02/2023 20:33:05
