<a href="https://colab.research.google.com/github/dapivei/riiaa/blob/master/Prediccion_de_Grafos_y_attribucion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download colab_utils and import



<a href="https://colab.research.google.com/github/beangoben/gnn_workshop_riiaa/blob/master/Prediccion de Nodos con Arxiv MAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [None]:
# Fetch the workshop helper module into the working directory (-O overwrites any existing copy).
!wget https://raw.githubusercontent.com/beangoben/workshop_template/master/colab_utils.py -O colab_utils.py
# Remove the `sample_data` directory if it exists (presumably the default Colab sample folder — confirm).
!rm -rf sample_data
import colab_utils

Clone github repo, install stuff

In [None]:
# Repository handed to the colab_utils clone helper below.
github_repo = 'https://github.com/beangoben/ML_DL_con_moleculas'
colab_utils.clone_repo(github_repo)
# NOTE(review): these helpers presumably read an environment YAML shipped with the cloned repo — confirm in colab_utils.
colab_utils.conda_install_from_yaml()
colab_utils.pip_install_from_yaml()

# Import modules


In [None]:
import os
from collections import OrderedDict, defaultdict

import tqdm.auto as tqdm
import colab_utils
if colab_utils.IN_COLAB:
    colab_utils.add_conda_dir_to_python_path()

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

import sklearn
import umap
import tensorflow as tf
import sonnet as snt
import graph_nets as gn

import rdkit
import rdkit.Chem
import rdkit.Chem.AllChem as Chem
from rdkit.Chem import AllChem, Draw
import rdkit.Chem.rdFingerprintGenerator as rdFP
from rdkit.Chem.Draw import IPythonConsole

# Presumably prints the version of each listed module (helper lives in colab_utils — confirm).
colab_utils.print_module_versions([umap, tf, snt, rdkit])
# Show the GPU devices TensorFlow can see; an empty list means the run is CPU-only.
print(f'Tiene GPU? {tf.config.list_physical_devices("GPU")}')
# NOTE(review): presumably applies the workshop's matplotlib defaults — confirm in colab_utils.
colab_utils.matplotlib_settings()

# Data wrangling
## Delaney solubility dataset con 🐼s

In [None]:
# Relative path: the working directory must contain `data/delaney-processed.csv`.
df = pd.read_csv('data/delaney-processed.csv')
# Parse each SMILES string into an RDKit molecule object, stored in a new 'mol' column.
# NOTE(review): MolFromSmiles returns None for SMILES it cannot parse and nothing here filters those — confirm the file is clean.
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
print(df.shape)
print(df.columns)
df.head()

## Visualizar moleculas

In [None]:
# Draw 9 randomly chosen molecules; no random_state is passed, so the grid differs between runs.
# `mols` is only a temporary here: the train/test split cell rebinds it to the full molecule array.
mols = df['mol'].sample(n=9).tolist()
Draw.MolsToGridImage(mols)

## Construir train-test split

In [None]:
# Import the splitter explicitly: `import sklearn` alone does not load the
# `sklearn.model_selection` submodule, so attribute access on it only works when
# some other import happened to load it first.
from sklearn.model_selection import train_test_split

# Fixed seed so the partition (and every metric computed from it) is reproducible
# under Restart & Run All.
SPLIT_SEED = 42

# Split the index values rather than the data itself, so the very same partition can
# be reused for every representation built later (targets, fingerprints, graphs...).
indices = np.array(df.index)
mols = np.array(df['mol'].tolist())
train_index, test_index = train_test_split(
    indices, test_size=.20, random_state=SPLIT_SEED)
mols_train, mols_test = mols[train_index], mols[test_index]
print(len(train_index),len(test_index))

# Propiedad de interes ($y$)

In [None]:
# Name of the dataframe column used as the regression target.
target = "measured log solubility in mols per litre"
# Column vector of shape (n_rows, 1) in float32 (explicit reshape/astype below).
y_true = df[target].values.reshape(-1,1).astype(np.float32)
# Reuse the split indices so targets stay aligned with the molecules of each split.
y_train = y_true[train_index]
y_test = y_true[test_index]
print(y_train.shape, y_test.shape)

# Overlay the full-data and train-only distributions to eyeball whether the split is representative.
# NOTE(review): seaborn deprecated `distplot` (v0.11) in favour of `histplot`/`displot` — consider migrating.
sns.distplot(y_true)
sns.distplot(y_train)

### Extra: Preprocesador

In [None]:
from sklearn.preprocessing import StandardScaler


# Standardise the target (zero mean, unit variance).
# NOTE(review): the scaler is fitted on `y_true`, i.e. on train AND test rows. That is fine for
# this visual demo, but fit on `y_train` only if the scaled target is ever used for modelling.
y_preproc = StandardScaler()
y_new = y_preproc.fit_transform(y_true)

sns.distplot(y_new)

## Métricas (¿Qué tan bueno es mi modelo?)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error


def evaluate(y_true, y_pred, prefix=''):
    """Compute regression metrics for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned row by row with `y_true`.
        prefix: String prepended to every metric name (e.g. 'train_').

    Returns:
        OrderedDict with the keys f'{prefix}MAE' and f'{prefix}R2', in that order.
    """
    # Score the arguments that were actually passed in; the previous version compared
    # the global `y_test` with itself and therefore always reported MAE=0 / R2=1.
    stats = OrderedDict([(f'{prefix}MAE', mean_absolute_error(y_true, y_pred)),
                         (f'{prefix}R2', r2_score(y_true, y_pred))
                         ])
    return stats

print(evaluate(y_true, y_true))

In [None]:
def plot_preds(y_true, y_pred):
    """Scatter predicted values against true values, with the ideal y = x line.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with `y_true`.
    """
    plt.figure(figsize=(12,8))
    # Label every artist: calling plt.legend() with no labelled artists only
    # produces a warning and an empty legend.
    plt.scatter(y_true, y_pred, label='predictions')
    # Reference diagonal: a perfect model puts every point on it.
    lo = min(np.min(y_true), np.min(y_pred))
    hi = max(np.max(y_true), np.max(y_pred))
    plt.plot([lo, hi], [lo, hi], 'k--', label='y = x')
    plt.xlabel('y_true')
    plt.ylabel('y_pred')
    plt.legend()
    plt.show()
    
plot_preds(y_true, y_true)

# Inputs (x): Representaciones moleculares

## Representaciones topologicas (fingerprints)

In [None]:
# Morgan fingerprint (radius 2, 2048 bits) for every molecule in `mols`.
fps =  [Chem.GetMorganFingerprintAsBitVect(m,2,nBits=2048) for m in mols]
# Convert the bit vectors to one numeric array; expected shape (n_molecules, 2048) — the print below shows it.
fps = np.vstack(np.array(fps))
# Reuse the split indices so rows stay aligned with y_train / y_test.
fps_train, fps_test = fps[train_index], fps[test_index]
print(fps_train.shape, fps_test.shape)

## Similitud estructural

In [None]:
import scipy.spatial.distance as spdist

# Jaccard dissimilarity between the first two fingerprints: 0 means identical on-bits,
# 1 means no on-bit in common (1 - this value is the Tanimoto similarity).
dist01 = spdist.jaccard(fps[0],fps[1])



## Vecinos estructurales

## Representaciones quimo-informaticas (mordred)

In [None]:
from mordred import Calculator, descriptors

# Calculator over mordred's `descriptors` package, with 3-D descriptors skipped (ignore_3D=True).
calc = Calculator(descriptors, ignore_3D=True)
# One descriptor row per molecule; tqdm only adds a progress bar.
# NOTE(review): mordred can yield non-numeric "missing/error" entries for some descriptors —
# check the array's dtype before feeding it to a model.
cheminfo = np.array([calc(m) for m in tqdm.tqdm(mols)])
# Same split indices as the other representations, so rows stay aligned with the targets.
cheminfo_train,  cheminfo_test = cheminfo[train_index], cheminfo[test_index]
print(cheminfo_train.shape, cheminfo_test.shape)

## Comparativa geometrica de las representaciones

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Both pipelines standardise the features first, then project to 2 dimensions.
pca_pipe = Pipeline([('scaler', StandardScaler()),
                                  ('dim_reduce', PCA(2))])

# UMAP with library defaults; no random_state is set, so the embedding can differ between runs.
umap_pipe = Pipeline([('scaler', StandardScaler()),
                                  ('dim_reduce', umap.UMAP())])


# Fit on the full fingerprint matrix (train and test rows together) — these projections
# are used for visualisation only.
x_umap = umap_pipe.fit_transform(fps)
x_pca = pca_pipe.fit_transform(fps)

print(x_umap.shape, x_pca.shape)

In [None]:
# UMAP embedding coloured by the target value (viridis colormap).
# `y_true` has shape (n, 1); it has as many elements as there are points.
plt.scatter(x_umap[:,0], x_umap[:, 1],
            c=y_true,
            cmap='viridis',
            s=10, alpha=0.5)
plt.show()

In [None]:
# Tidy table for the interactive charts: one row per molecule, with both 2-D projections,
# the target value and identifying columns used in the tooltips.
vis_df = pd.DataFrame()
vis_df['UMAP1'] = x_umap[:, 0]
vis_df['UMAP2'] = x_umap[:, 1]
vis_df['PC1'] = x_pca[:, 0]
vis_df['PC2'] = x_pca[:, 1]
# Flatten the (n, 1) target so it fits in a single column.
vis_df['label'] = y_true.ravel()
# Positional row number (0..n-1), not the dataframe's own index.
vis_df['index'] = np.arange(len(df))
vis_df['name'] = df['Compound ID'].tolist()
vis_df

In [None]:
# Interactive (pan/zoom) UMAP scatter; hovering a point shows its target value, row number and compound name.
alt.Chart(vis_df).mark_circle(size=10).encode(
    x='UMAP1:Q',
    y='UMAP2:Q',
    color=alt.Color('label:Q', scale=alt.Scale(scheme='viridis')),
    tooltip=['label', 'index', 'name']
).interactive()

In [None]:
# One interval selection shared by both charts: brushing a region in either projection
# keeps the selected points coloured in both and greys out the rest.
# NOTE(review): `alt.selection(type=...)` / `.add_selection` are deprecated in Altair 5
# (`selection_interval` / `add_params`) — confirm the installed Altair version.
brush = alt.selection(type='interval', resolve='global')

# Left panel: PCA projection.
scatter1 = alt.Chart(vis_df).mark_circle(size=4).encode(
    x='PC1:Q',
    y='PC2:Q',
    color = alt.condition(brush, alt.Color('label:Q', scale=alt.Scale(scheme='viridis')), alt.value('lightgray')),
    tooltip=['label', 'index', 'name']
).add_selection(brush)

# Right panel: UMAP projection. Unlike the left panel, no explicit colour scheme is given here.
scatter2 = alt.Chart(vis_df).mark_circle(size=4).encode(
    x='UMAP1:Q',
    y='UMAP2:Q',
    color = alt.condition(brush, alt.Color('label:Q'), alt.value('lightgray')),
    tooltip=['label', 'index', 'name']
).add_selection(brush)

# `|` concatenates the two charts horizontally.
scatter1 | scatter2

## Grafos moleculares

In [None]:
# Element symbols with their own one-hot slot. The last entry, 'Unknown', is the
# catch-all slot that one_of_k_encoding_unk assigns to any symbol not listed here.
possible_atom_list = ['S', 'Si', 'F', 'O',
                      'C', 'I', 'P', 'Cl', 'Br', 'N', 'Unknown']
# List of vocabularies; currently it holds only the atom symbols.
reference_lists = [possible_atom_list]

def one_of_k_encoding(x, allowable_set):
    """One-hot encode `x` against `allowable_set`.

    Returns a list of booleans, one per element of `allowable_set`, that is True
    wherever the element equals `x`. Raises Exception when `x` is not a member.
    """
    if x in allowable_set:
        return [x == candidate for candidate in allowable_set]
    raise Exception("input {0} not in allowable set{1}:".format(
        x, allowable_set))

def one_of_k_encoding_unk(x, allowable_set):
    """One-hot encode `x`, sending values outside `allowable_set` to its last element."""
    target = x if x in allowable_set else allowable_set[-1]
    return [target == option for option in allowable_set]


def safe_index(l, e):
    """Gets the index of e in l, providing an index of len(l) if not found.

    Args:
        l: Sequence exposing `.index()` (e.g. a list).
        e: Element to look up.

    Returns:
        Position of the first occurrence of `e`, or `len(l)` when `e` is absent.
    """
    try:
        return l.index(e)
    except ValueError:
        # Only "not found" maps to len(l); a bare `except:` would also hide
        # unrelated failures (and even KeyboardInterrupt).
        return len(l)


def get_feature_list(atom):
    """Return the integer-coded features of an atom as a list.

    The list currently has a single entry: the position of the atom's symbol in
    `possible_atom_list`, or `len(possible_atom_list)` when the symbol is not
    listed (the fallback value of `safe_index`).
    """
    # `features` was previously indexed without ever being created, which raised
    # NameError on every call; build the list from the looked-up index instead.
    features = [safe_index(possible_atom_list, atom.GetSymbol())]
    return features

def atom_features(atom):
    """Feature vector of an atom: float32 one-hot of its element symbol.

    Symbols missing from `possible_atom_list` fall into its last slot.
    """
    symbol_one_hot = one_of_k_encoding_unk(atom.GetSymbol(), possible_atom_list)
    return np.array(symbol_one_hot, dtype=np.float32)

def bond_features(bond):
    """Feature vector of a bond: float32 one-hot over single/double/triple/aromatic."""
    kind = bond.GetBondType()
    bond_types = Chem.rdchem.BondType
    # The order of the reference types fixes the meaning of each output position.
    reference_types = (bond_types.SINGLE, bond_types.DOUBLE,
                       bond_types.TRIPLE, bond_types.AROMATIC)
    flags = [kind == reference for reference in reference_types]
    return np.array(flags).astype(np.float32)

def get_bond_pair(mol):
    """Collect bond endpoints and the adjacency matrix of a molecule.

    Returns:
        A pair `(res, adj)`: `res` is `[begin_indices, end_indices]` with one entry
        per bond (each bond listed once, in the molecule's bond order), and `adj`
        is a symmetric (n_atoms, n_atoms) array holding 1 for bonded atom pairs.
    """
    n_atoms = mol.GetNumAtoms()
    adjacency = np.zeros((n_atoms, n_atoms))
    begins, ends = [], []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        begins.append(i)
        ends.append(j)
        # Bonds are undirected, so mark both orientations.
        adjacency[i, j] = 1
        adjacency[j, i] = 1

    return [begins, ends], adjacency


def mol2graph_data(mol):
    """Convert an RDKit molecule into plain arrays describing its graph.

    Returns:
        A 4-tuple `(node_features, edge_index, edge_features, adjacency)`:
        node features stacked one row per atom, an (n_bonds, 2) array of
        (begin, end) atom indices, edge features stacked one row per bond, and
        the adjacency matrix from `get_bond_pair`.
    """
    node_f = [atom_features(atom) for atom in mol.GetAtoms()]
    edge_index, adj = get_bond_pair(mol)
    edge_f = [bond_features(bond) for bond in mol.GetBonds()]
    if edge_f:
        edge_feats = np.stack(edge_f)
    else:
        # np.stack([]) raises ValueError; a molecule without bonds (a single
        # atom) gets an empty (0, n_bond_features) float32 array instead.
        edge_feats = np.zeros((0, n_bond_features()), dtype=np.float32)
    return np.stack(node_f), np.stack(edge_index).T, edge_feats, adj


def n_atom_features():
    """Length of the atom feature vector, measured on the single atom of 'C'."""
    probe_mol = Chem.MolFromSmiles('C')
    probe_vector = atom_features(probe_mol.GetAtomWithIdx(0))
    return len(probe_vector)


def n_bond_features():
    """Length of the bond feature vector, measured on the single bond of 'CC'."""
    probe_mol = Chem.MolFromSmiles('CC')
    probe_vector = bond_features(probe_mol.GetBondWithIdx(0))
    return len(probe_vector)

In [None]:
def mol_to_datatdict(mol):
  """Convert an RDKit molecule into a graph_nets data dict.

  Every chemical bond becomes two directed edges (begin->end and end->begin)
  that share the same feature vector.

  Args:
    mol: RDKit molecule to convert.

  Returns:
    Dict with the keys 'nodes' (n_atoms, n_atom_features) float32, 'edges'
    (2 * n_bonds, n_bond_features) float32, 'globals' (a single float32 zero),
    and 'senders' / 'receivers' (2 * n_bonds,) int32 atom indices.
  """
  # NOTE: the argument is used as given. The previous version overwrote it with
  # df['mol'].iloc[0], so every graph in the dataset was the first molecule.
  nodes = np.array([atom_features(atom) for atom in mol.GetAtoms()],
                   dtype=np.float32)

  edges, senders, receivers = [], [], []
  for bond in mol.GetBonds():
    feats = bond_features(bond)
    begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
    # Same order as before: begin->end first, then end->begin.
    edges.extend([feats, feats])
    senders.extend([begin, end])
    receivers.extend([end, begin])

  if edges:
    edges = np.array(edges, dtype=np.float32)
  else:
    # Bond-free molecule (single atom): keep the feature axis so the edge arrays
    # of different molecules can still be concatenated when batching.
    edges = np.zeros((0, n_bond_features()), dtype=np.float32)

  # Explicit integer dtype: np.array([]) would otherwise be float64 for a
  # bond-free molecule; graph_nets converts these fields to int32 anyway.
  data_dict = {'nodes': nodes, 'edges': edges,
               'globals': np.array([0]).astype(np.float32),
               'senders': np.array(senders, dtype=np.int32),
               'receivers': np.array(receivers, dtype=np.int32)}
  return data_dict

# Convert every molecule of each split into a graph_nets data dict.
data_dict_train = [mol_to_datatdict(mol) for mol in mols_train]
data_dict_test = [mol_to_datatdict(mol) for mol in mols_test]

# Batch each split's list of dicts into one GraphsTuple.
# From here on `x_train` / `x_test` are GraphsTuples, not feature matrices.
x_train = gn.utils_tf.data_dicts_to_graphs_tuple(data_dict_train)
x_test = gn.utils_tf.data_dicts_to_graphs_tuple(data_dict_test)
x_test

# Construir modelos

## Guardar resultados para la comparativa

In [None]:
results = []

def generar_resultado(y_true, y_pred, method, feature):
    """Build one row of the comparison table.

    The row starts with the method name ('metodo') and the input representation
    ('feature'), followed by whatever metrics `evaluate` returns.
    """
    metrics = evaluate(y_true, y_pred)
    row = OrderedDict([('metodo', method), ('feature', feature)])
    row.update(metrics)
    return row
    
pd.DataFrame([generar_resultado(y_true, y_true, 'exp', 'datos')])

Unnamed: 0,metodo,feature,MAE,R2
0,exp,datos,0.0,1.0


## El zoológico de ML con scikit-learn

### Modelo lineal

$$
y = W \cdot x + b
$$

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the fingerprint matrix. `x_train` is a graph_nets GraphsTuple by the time
# this cell runs, which scikit-learn cannot take as a feature matrix.
model = LinearRegression()
model.fit(fps_train, y_train)
# Predictions on the training rows (same rows the model was fitted on).
y_pred = model.predict(fps_train)

## Random forest/ Gradient Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Only instantiated here (library defaults); it is not fitted in this cell.
# Note this rebinds `model`, replacing the linear model from the previous cell.
model = GradientBoostingRegressor()


## Proceso Gaussiano

## Una red neuronal con tf y sonnet

Utilidades para modelos

In [None]:
def get_num_parameters(model: snt.Module, trainable: bool = True) -> int:
  """Count the scalar parameters of a module.

  With `trainable=True` only `model.trainable_variables` are counted; otherwise
  every entry of `model.variables` is.
  """
  if trainable:
    selected = model.trainable_variables
  else:
    selected = model.variables
  # Number of elements of each variable = product of its shape.
  sizes = [np.prod(var.shape) for var in selected]
  return int(np.sum(sizes))


def print_model(model: snt.Module):
  """Print a summary of a module: class and name, variable table, parameter counts."""
  header = f'{model.__class__.__name__} : {model.name}\n'
  print(header)
  variable_table = snt.format_variables(model.variables)
  print(variable_table)
  n_params = get_num_parameters(model, False)
  trainable_params = get_num_parameters(model, True)
  footer = f'\nParams: {trainable_params} trainable out of {n_params}'
  print(footer)

In [None]:
# Two hidden layers of 10 units plus a final layer of width 1, so the output has
# the same (n, 1) shape as the regression target.
model = snt.nets.MLP([10, 10, 1])
# The MLP needs a dense float matrix: use the fingerprints (`x_train` is a
# GraphsTuple meant for the GNN). The call also creates the module's variables.
y_pred = model(tf.constant(fps_train.astype(np.float32)))
# Summarise the model just built (`bloque` is only defined in a later cell).
print_model(model)

### Definir variables de optimizacion

In [None]:
# Number of full-batch gradient steps taken by the training loop.
NUM_ITER = 100
# Adam with learning rate 3e-4.
optimizer = tf.optimizers.Adam(3e-4)
# NOTE(review): `metric` is created but not used by the training loop in this notebook.
metric = tf.keras.metrics.MeanAbsoluteError()
# Mean squared error is the quantity minimised during training.
loss_fn = tf.keras.losses.MeanSquaredError()

In [None]:
# Dense float32 inputs for the MLP. `x_train` / `x_test` are GraphsTuples (built for
# the GNN) and cannot be fed to a dense network, so the fingerprints are used here.
# Assumes `model` maps (n, n_bits) inputs to (n, 1) predictions.
mlp_train = tf.constant(fps_train.astype(np.float32))
mlp_test = tf.constant(fps_test.astype(np.float32))

pbar = tqdm.tqdm(range(NUM_ITER))
stats = []

for i in pbar:
    # One full-batch gradient step.
    with tf.GradientTape() as tape:
        y_pred = model(mlp_train)
        loss = loss_fn(y_train, y_pred)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # Train statistics (after the update). The prefixes end in '_' so the columns
    # are named 'train_MAE', 'test_MAE', ... as the plotting cell expects.
    y_pred = model(mlp_train)
    stat = evaluate(y_train, y_pred.numpy(), 'train_')
    stat['train_loss'] = float(loss_fn(y_train, y_pred))
    # Test statistics.
    y_pred = model(mlp_test)
    stat.update(evaluate(y_test, y_pred.numpy(), 'test_'))
    stat['test_loss'] = float(loss_fn(y_test, y_pred))
    stats.append(stat)
    # Update progress bar.
    pbar.set_postfix(stats[-1])
train_df = pd.DataFrame(stats)
train_df

NameError: name 'NUM_ITER' is not defined

In [None]:
# Loss curves on a log scale. Each line carries a label: plt.legend() with no
# labelled artists only emits a warning and draws nothing.
# Expects `train_df` to have the columns listed in each loop.
for key in ['train_loss','test_loss']:
    plt.plot(train_df[key], label=key)
plt.yscale('log')
plt.xlabel('iteration')
plt.legend()
plt.show()

# Mean absolute error on both splits.
for key in ['train_MAE','test_MAE']:
    plt.plot(train_df[key], label=key)
plt.xlabel('iteration')
plt.legend()
plt.show()

## Conseguir embeddings

## GNN: NN en grafos

### Bloque de transformacion

In [None]:
def make_mlp_model(latent_size=32, n_layers=2, add_head=0):
    """Build an MLP followed by LayerNorm and, optionally, a linear head.

    Args:
        latent_size: Width of every hidden layer.
        n_layers: Number of hidden layers.
        add_head: When > 0, output size of an extra linear layer appended at the end.

    Returns:
        A `snt.Sequential` chaining the pieces in that order.
    """
    hidden_sizes = [latent_size] * n_layers
    core = snt.nets.MLP(hidden_sizes, activate_final=True)
    norm = snt.LayerNorm(axis=-1, create_offset=True, create_scale=True)
    stack = [core, norm]
    if add_head > 0:
        stack.append(snt.Linear(add_head))
    return snt.Sequential(stack)
    
# Build one transformation block (latent size 32, 2 layers, no head) and run it once on
# the node features; the call happens before print_model so there are variables to list
# (Sonnet modules presumably create their variables on first call — confirm).
bloque = make_mlp_model(32, 2, 0)
bloque(x_train.nodes)
print_model(bloque)

### Graphnets

In [None]:
# Read-out head: maps each graph's global vector to one scalar (the predicted target).
# It has to act on the globals because the training loop reads `gnn(...).globals`;
# a node-level head would leave the globals at the width set by the last GraphNetwork.
cabeza = gn.modules.GraphIndependent(global_model_fn=lambda: snt.Linear(1))

# Three message-passing layers; edges, nodes and globals are each updated by their own
# MLP block of latent size 32.
gnn_layers = [gn.modules.GraphNetwork(
    edge_model_fn=lambda: make_mlp_model(32, 2),
    node_model_fn=lambda: make_mlp_model(32, 2),
    global_model_fn=lambda: make_mlp_model(32, 2)) for i in range(3)]

gnn = snt.Sequential( gnn_layers + [cabeza])
# Run the network once on the training graphs before summarising it.
out = gnn(x_train)
print_model(gnn)

In [None]:
# Same settings as for the MLP, re-created for the GNN run so this section gets its own
# fresh optimizer object.
NUM_ITER = 100
optimizer = tf.optimizers.Adam(3e-4)
# NOTE(review): `metric` is created but not used by the training loop in this notebook.
metric = tf.keras.metrics.MeanAbsoluteError()
loss_fn = tf.keras.losses.MeanSquaredError()




In [None]:
pbar = tqdm.tqdm(range(NUM_ITER))
stats = []

for i in pbar:
    # One full-batch gradient step. The forward pass must go through `gnn` inside
    # the tape: calling the unrelated `model` here would give a loss that does not
    # depend on the GNN variables, so every gradient would be None.
    with tf.GradientTape() as tape:
        y_pred = gnn(x_train).globals
        loss = loss_fn(y_train, y_pred)
    grads = tape.gradient(loss, gnn.trainable_variables)
    # Skip variables that received no gradient (not on the path to the globals).
    optimizer.apply_gradients(
        (grad, var)
        for (grad, var) in zip(grads, gnn.trainable_variables)
        if grad is not None)
    # Train statistics (after the update). The prefixes end in '_' so the columns
    # are named 'train_MAE', 'test_MAE', ... as the plotting cell expects.
    y_pred = gnn(x_train).globals
    stat = evaluate(y_train, y_pred.numpy(), 'train_')
    stat['train_loss'] = float(loss_fn(y_train, y_pred))
    # Test statistics.
    y_pred = gnn(x_test).globals
    stat.update(evaluate(y_test, y_pred.numpy(), 'test_'))
    stat['test_loss'] = float(loss_fn(y_test, y_pred))
    stats.append(stat)
    # Update progress bar.
    pbar.set_postfix(stats[-1])
train_df = pd.DataFrame(stats)
train_df

In [None]:
# Loss curves on a log scale. Each line carries a label: plt.legend() with no
# labelled artists only emits a warning and draws nothing.
# Expects `train_df` to have the columns listed in each loop.
for key in ['train_loss','test_loss']:
    plt.plot(train_df[key], label=key)
plt.yscale('log')
plt.xlabel('iteration')
plt.legend()
plt.show()

# Mean absolute error on both splits.
for key in ['train_MAE','test_MAE']:
    plt.plot(train_df[key], label=key)
plt.xlabel('iteration')
plt.legend()
plt.show()