In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
from torch.utils.data import Dataset, DataLoader
import glob
import wandb
import os
import torch.optim as optimizers
import dfs_code
from torch_geometric.data import InMemoryDataset, Data
import pickle
import torch
import torch.nn as nn
import tqdm
import copy
import pandas as pd
import torch.nn.functional as F
import sys
import yaml
import functools
from ml_collections import ConfigDict
sys.path = ['../../src'] + sys.path
from dfs_transformer import DFSCodeSeq2SeqFC, Deepchem2TorchGeometric, Trainer, to_cuda

import random

# download pretrained model

In [3]:
# Seed every random number generator this notebook relies on (Python's
# `random`, PyTorch, NumPy) with one shared value so runs are repeatable.
seed = 42
for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    seed_fn(seed)

In [4]:
# Open a short-lived W&B run whose only purpose is to fetch the pretrained
# model artifact; the artifact is downloaded to a local directory.
run = wandb.init(mode="online",
                 project="pubchem",
                 entity="dfstransformer",
                 job_type="inference")
try:
    model_at = run.use_artifact("bertloops0.3-10M-nofeats" + ":latest")
    model_dir = model_at.download()
finally:
    # Always close the run, even if the artifact lookup or download fails;
    # otherwise the failed run stays open in the kernel.
    run.finish()

# Notebook-wide settings used by the cells below.
features = None# "chemprop"          # forwarded to Deepchem2TorchGeometric(features=...)
n_molecules = 10                      # molecules encoded; also the number of k-means clusters
n_samples = 200                       # random DFS codes drawn per molecule in collate_fn
fingerprint = 'min-mean-max-std'      # passed as the last argument of model.encode
load_flag = True                      # load checkpoint.pt into the model when True

2022/10/28 18:31:39	ERROR	wandb.jupyter	Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33meth-compiler-opters[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.13.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[34m[1mwandb[0m: Downloading large artifact bertloops0.3-10M-nofeats:latest, 95.62MB. 2 files... Done. 0:0:0


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [5]:
# Read the training configuration shipped with the downloaded artifact and
# wrap it in a ConfigDict for attribute-style access.
# NOTE(review): yaml.load with FullLoader is applied to a downloaded file;
# confirm the artifact is trusted, or check whether yaml.safe_load suffices.
with open(model_dir+"/config.yaml") as file:
    raw_config = yaml.load(file, Loader=yaml.FullLoader)
config = ConfigDict(raw_config)

In [6]:
# Pick the GPU named in the training config when CUDA is available,
# otherwise fall back to the CPU ...
if torch.cuda.is_available():
    device = torch.device('cuda:%d'%config.training.gpu_id)
else:
    device = torch.device('cpu')
# ... but that choice is overridden unconditionally: everything below runs on
# the CPU. NOTE(review): presumably a deliberate override for this notebook;
# delete the next line to use the device selected above.
device = 'cpu'

In [7]:
# Model section of the loaded config; it is unpacked as keyword arguments
# when the model is constructed.
m = config.model

In [8]:
# Build the model from the config and, when requested, restore the pretrained
# weights from the downloaded artifact.
# NOTE(review): the output recorded for this cell is a RuntimeError from
# load_state_dict (missing `fcs.*` keys, unexpected `fc_*` keys), i.e. the
# checkpoint does not match the current DFSCodeSeq2SeqFC definition. The
# checkpoint presumably needs the library version it was trained with.
model = DFSCodeSeq2SeqFC(**m)
if load_flag:
    checkpoint_path = model_dir+'/checkpoint.pt'
    state_dict = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state_dict)

RuntimeError: Error(s) in loading state_dict for DFSCodeSeq2SeqFC:
	Missing key(s) in state_dict: "fcs.dfs_from.weight", "fcs.dfs_from.bias", "fcs.dfs_to.weight", "fcs.dfs_to.bias", "fcs.atomic_num_from.weight", "fcs.atomic_num_from.bias", "fcs.atomic_num_to.weight", "fcs.atomic_num_to.bias", "fcs.formal_charge_from.weight", "fcs.formal_charge_from.bias", "fcs.formal_charge_to.weight", "fcs.formal_charge_to.bias", "fcs.chiral_tag_from.weight", "fcs.chiral_tag_from.bias", "fcs.chiral_tag_to.weight", "fcs.chiral_tag_to.bias", "fcs.num_Hs_from.weight", "fcs.num_Hs_from.bias", "fcs.num_Hs_to.weight", "fcs.num_Hs_to.bias", "fcs.hybridization_from.weight", "fcs.hybridization_from.bias", "fcs.hybridization_to.weight", "fcs.hybridization_to.bias", "fcs.is_aromatic_from.weight", "fcs.is_aromatic_from.bias", "fcs.is_aromatic_to.weight", "fcs.is_aromatic_to.bias", "fcs.bond_type.weight", "fcs.bond_type.bias". 
	Unexpected key(s) in state_dict: "fc_dfs_idx1.weight", "fc_dfs_idx1.bias", "fc_dfs_idx2.weight", "fc_dfs_idx2.bias", "fc_atom1.weight", "fc_atom1.bias", "fc_atom2.weight", "fc_atom2.bias", "fc_bond.weight", "fc_bond.bias". 

In [None]:
# Move the model to the selected device; as the last expression of the cell
# its return value is also displayed.
model.to(device)

In [None]:
# Count the scalar entries of every parameter that requires gradients.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum([np.prod(p.size()) for p in model_parameters])
print("number of trainable parameters %d"%params)

# load dataset

In [None]:
# Load the BBBP training split and turn the SMILES/target columns into the
# dataset object used by the rest of the notebook.
# NOTE(review): the CSV is read from split directory `bbbp/0` while `loaddir`
# points at `bbbp/1` -- confirm that mixing the two is intended.
trainset = pd.read_csv("../../datasets/mymoleculenet/bbbp/0/train.csv")
train_X = trainset["smiles"].to_numpy()
train_y = trainset["target"].to_numpy()
traindata = Deepchem2TorchGeometric(
    train_X,
    train_y,
    loaddir="../../results/mymoleculenet_plus_features/bbbp/1/",
    features=features,
)

In [None]:
# Keep only the molecules whose `edge_features` has exactly 40 rows, so that
# every retained example has the same number of edge entries.
traindata = list(filter(lambda d: d.edge_features.shape[0] == 40, traindata))

In [None]:
def collate_fn(dlist, n_samples=n_samples):
    """Expand each molecule into ``n_samples`` randomly drawn DFS codes.

    Parameters
    ----------
    dlist : iterable
        Dataset items. Each item is read for ``smiles``, ``z``, ``edge_attr``,
        ``node_features``, ``edge_features`` and ``y``.
    n_samples : int
        Number of random DFS codes generated per item (defaults to the
        notebook-level ``n_samples`` at definition time).

    Returns
    -------
    smiles : list
        One entry per molecule. A string ``smiles`` attribute is stored as a
        whole string; any other iterable is spliced in as before.
    code_batch : list of torch.LongTensor
        One DFS code per (molecule, sample) pair.
    node_batch, edge_batch : list of torch.Tensor
        Clones of the item's node / edge features, repeated once per sample.
    y : torch.Tensor
        Concatenated targets with a trailing dimension added by
        ``unsqueeze(1)``.

    Notes
    -----
    ``torch.cat`` receives an empty list when ``dlist`` is empty or
    ``n_samples`` is 0; that case is not handled here.
    """
    node_batch = []
    edge_batch = []
    y_batch = []
    code_batch = []
    smiles = []

    for d in dlist:
        # `list += str` extends the list one character at a time, so the
        # SMILES of several molecules in a batch could not be told apart.
        # Store a string as one entry; `''.join(...)` on it still works.
        if isinstance(d.smiles, str):
            smiles.append(d.smiles)
        else:
            smiles += d.smiles

        # The label lists do not change between samples; compute them once.
        node_labels = d.z.numpy().tolist()
        edge_labels = np.argmax(d.edge_attr.numpy(), axis=1).tolist()

        for _ in range(n_samples):
            # The second return value (an index) is not needed here.
            code, _index = dfs_code.rnd_dfs_code_from_torch_geometric(d, node_labels, edge_labels)

            code_batch.append(torch.tensor(code, dtype=torch.long))
            node_batch.append(d.node_features.clone())
            edge_batch.append(d.edge_features.clone())
            y_batch.append(d.y.clone())
    y = torch.cat(y_batch).unsqueeze(1)
    return smiles, code_batch, node_batch, edge_batch, y

In [None]:
# One molecule per batch, in dataset order; collate_fn expands each molecule
# into its sampled DFS codes.
trainloader = DataLoader(
    traindata,
    collate_fn=collate_fn,
    batch_size=1,
    shuffle=False,
    pin_memory=False,
)

In [None]:
# Encode the first `n_molecules` batches from the loader. Each batch holds
# the sampled DFS codes of one molecule and is labelled with the molecule's
# position `i`, which later serves as ground truth for clustering.
encodings = []
encodings2 = []
labels = []
dfs_codes = []
smiles = []

# Inference only: switch layers such as dropout to evaluation behaviour and
# skip autograd bookkeeping.
model.eval()
iterator = iter(trainloader)
with torch.no_grad():
    for i in range(n_molecules):
        # collate_fn returns (smiles, codes, node feats, edge feats, y).
        batch_smiles, *tensors = next(iterator)
        smiles += [batch_smiles]
        d = [to_cuda(dd, device) for dd in tensors]
        model_inputs = d[:-1]  # everything except the targets `y`

        encodings.append(model.encode(*model_inputs, fingerprint).detach().cpu().numpy())
        labels += len(d[0])*[i]  # one label per sampled DFS code

        # Second representation: the five model outputs averaged over dim 0
        # and concatenated along dim 1.
        # NOTE(review): assumes the model returns exactly five tensors --
        # confirm against the DFSCodeSeq2SeqFC version in use.
        dfs1, dfs2, atm1, atm2, bnd = model(*model_inputs)
        enc2 = torch.cat((dfs1.mean(dim=0), dfs2.mean(dim=0), atm1.mean(dim=0), atm2.mean(dim=0), bnd.mean(dim=0)), dim=1)
        encodings2.append(enc2.detach().cpu().numpy())
encodings = np.concatenate(encodings, axis=0)
encodings2 = np.concatenate(encodings2, axis=0)

# sklearn kmeans benchmark

In [None]:
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def bench_k_means(kmeans, name, data, labels):
    """Fit a KMeans estimator and print one tab-separated row of scores.

    The estimator is fitted behind a ``StandardScaler`` in a pipeline and
    the fit is timed. The printed row holds: name, fit time, inertia,
    homogeneity, completeness, V-measure, adjusted Rand index, adjusted
    mutual information and silhouette.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance whose initialization
        is already configured.
    name : str
        Label printed in the first column of the row.
    data : ndarray of shape (n_samples, n_features)
        The samples to cluster.
    labels : ndarray of shape (n_samples,)
        Ground-truth labels for the supervised clustering metrics.
    """
    started = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - started

    fitted_kmeans = estimator[-1]
    predicted = fitted_kmeans.labels_
    row = [name, fit_time, fitted_kmeans.inertia_]

    # Metrics that compare the true labels with the predicted cluster ids.
    supervised_scores = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    row.extend(score(labels, predicted) for score in supervised_scores)

    # The silhouette is computed on the unscaled input samples, using a
    # subsample of 300 points.
    row.append(
        metrics.silhouette_score(data, predicted,
                                 metric="euclidean", sample_size=300)
    )

    # One string column, the time with an "s" suffix, the inertia without
    # decimals, then six scores with three decimals each.
    template = "\t".join(["{:9s}", "{:.3f}s", "{:.0f}"] + 6 * ["{:.3f}"])
    print(template.format(*row))

In [None]:
# Clustering input: the stacked model encodings cast to float64.
data = encodings.astype(np.float64)

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Print a small results table: one row per k-means initialisation strategy,
# with one cluster per molecule.
print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# Two built-in initialisations, four restarts each, with a fixed seed.
for init_name in ("k-means++", "random"):
    kmeans = KMeans(init=init_name, n_clusters=n_molecules, n_init=4,
                    random_state=0)
    bench_k_means(kmeans=kmeans, name=init_name, data=data, labels=labels)

# Deterministic initialisation from the principal components, so a single
# run is enough.
pca = PCA(n_components=n_molecules).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_molecules, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

print(82 * '_')

10K
init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
k-means++	3.851s	1299580	0.959	0.966	0.962	0.924	0.962	0.139
random   	2.292s	1305407	0.940	0.970	0.954	0.870	0.954	0.135
PCA-based	0.554s	1383451	0.896	0.927	0.911	0.802	0.910	0.103

100K init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
k-means++	0.753s	1613620	0.873	0.874	0.874	0.835	0.871	0.066
random   	0.349s	1610052	0.740	0.763	0.751	0.606	0.746	0.075
PCA-based	0.165s	1645957	0.724	0.760	0.742	0.577	0.737	0.071

1M init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
k-means++	5.437s	1489769	0.761	0.782	0.771	0.621	0.767	0.047
random   	3.327s	1493680	0.810	0.838	0.824	0.702	0.820	0.050
PCA-based	1.175s	1571641	0.736	0.754	0.745	0.575	0.740	0.031

In [None]:
import matplotlib.pyplot as plt

# Project the encodings onto their first two principal components and
# cluster the 2-D points so the decision regions can be drawn.
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_molecules, n_init=4)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max] (data range padded by 1).
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation="nearest",
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect="auto", origin="lower")

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
            color="w", zorder=10)
# The data are molecule encodings, not the digits dataset of the
# scikit-learn example this cell was adapted from.
plt.title("K-means clustering on the molecule encodings (PCA-reduced data)\n"
          "Centroids are marked with white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

In [None]:
# Molecule index of every encoding as an array, so boolean masks such as
# `y == i` can select the rows belonging to one molecule.
y = np.asarray(labels)

In [None]:
import matplotlib.colors as mcolors
# Order the CSS4 colour names by the HSV value of their colour (ties broken
# by name) and keep just the names.
by_hsv = sorted(
    (tuple(mcolors.rgb_to_hsv(mcolors.to_rgb(rgb))), css_name)
    for css_name, rgb in mcolors.CSS4_COLORS.items()
)
names = [entry[1] for entry in by_hsv]

In [None]:
from sklearn.manifold import TSNE
# Embed the encodings in two dimensions with t-SNE (fixed seed) and draw one
# scatter series per molecule.
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(data)

target_ids = range(n_molecules)

from matplotlib import pyplot as plt
plt.figure(figsize=(6, 5))
# Take every 10th colour name from the reversed HSV-sorted list so that
# neighbouring molecules get clearly different colours.
colors = names[::-1][0:10*n_molecules:10]
if colors and len(colors) < n_molecules:
    # The slice yields fewer colours than molecules once the name list is
    # exhausted. Reuse colours cyclically instead of letting `zip` silently
    # drop the remaining molecules from the plot.
    colors = [colors[k % len(colors)] for k in range(n_molecules)]
for i, c in zip(target_ids, colors):
    plt.scatter(X_2d[y == i, 0], X_2d[y == i, 1], c=c, label=i)
plt.legend(loc=1)
plt.show()

In [None]:
# Display the configured number of molecules.
n_molecules

In [None]:
# Reassemble and display the SMILES string stored for the first encoded molecule.
''.join(smiles[0])

In [None]:
# Reassemble and display the SMILES string stored for the fifth encoded molecule.
''.join(smiles[4])