# Shape of Molecules #

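In this notebook, molecules from the HIV dataset are converted to graphs, embedded through node and edge entropies, and then classified as HIV-active or not with a small neural network.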

In [None]:
#Import statements
import warnings; warnings.simplefilter('ignore')
import random 


import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

from giotto.graphs.create_clique_complex import CreateCliqueComplex, CreateBoundaryMatrices, CreateLaplacianMatrices    
from giotto.graphs.heat_diffusion import HeatDiffusion
from giotto.graphs.graph_entropy import GraphEntropy

from molecules import mol_to_nx, compute_node_edge_entropy, bonds_type, graph_to_points
from plotting import plot_entropies, plot_network_diffusion

from rdkit import Chem 
from rdkit.Chem import Draw


## Import and Convert data to networkx Graph ##

Import the molecules dataset and convert each entry: $\textit{smiles}$ --> $\textit{rdkit.Chem.rdchem.Mol}$ --> $\textit{networkx.graph}$.

In [None]:
# Read the dataset and parse every SMILES string into an RDKit molecule.
df = pd.read_csv('hiv.csv')
df['g_mol'] = df['smiles'].apply(Chem.MolFromSmiles)
df.drop("smiles", axis=1, inplace=True)

# Convert each molecule to a networkx graph; rows 559 and 8097 are left out.
g_mol = []
for i in range(df.shape[0]):
    if i in (559, 8097):
        continue
    g_mol.append(mol_to_nx(df['g_mol'][i]))

## Plot one molecule ##

In [None]:
# First molecule, once as a networkx graph and once as an RDKit molecule.
g = g_mol[0]
mol = [df['g_mol'][0]]

# Graph drawing with a spring layout, followed by RDKit's grid image.
layout = nx.spring_layout(g)
nx.draw(g, pos=layout)
Draw.MolsToGridImage(mol, molsPerRow=5, useSVG=False)


## Create Embedding for all molecules of the dataset ##

In [None]:
# Hyperparameters: 30 evenly spaced values in [0, 2] for nodes and for edges.
taus_n = np.linspace(0, 2, num=30)
taus_e = np.linspace(0, 2, num=30)

# Embedding: one entry per molecule graph, computed in dataset order.
embeds = []
for idx, graph in enumerate(g_mol):
    embeds.append(compute_node_edge_entropy(graph, idx, taus_n, taus_e))

## Save ##

In [None]:
# Save all the embeddings to disk so they can be reloaded later.
import pickle

# The context manager flushes and closes the file even if dump() raises.
with open("hiv_embeds.pickle", "wb") as fh:
    pickle.dump(embeds, fh)

## Load ##

In [None]:
# Load the embeddings previously written to hiv_embeds.pickle.
import pickle

# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced by this notebook. The context manager closes the file afterwards.
with open("hiv_embeds.pickle", 'rb') as fh:
    embeds = pickle.load(fh)
print("Dataset loaded...")

# Adding Bonds information #

In [None]:
# NOTE(review): bonds_type presumably attaches the 'bonds_one_hot' node
# attribute read below - confirm in molecules.py.
bonds_type(g_mol)

# Flat list holding the 'bonds_one_hot' value of every node of every graph,
# in graph order.
freq_type_bonds = []
for g in g_mol:
    node_one_hots = nx.get_node_attributes(g, 'bonds_one_hot')
    freq_type_bonds += list(node_one_hots.values())

In [None]:
# Stack the per-atom entries into one array and report its two dimensions.
freq_type_bonds = np.array(freq_type_bonds)
n_rows = freq_type_bonds.shape[0]
print("Total number of atoms in the dataset: {}".format(n_rows))
n_cols = freq_type_bonds.shape[1]
print("Entry size: {}".format(n_cols))

# Universal Node Embedding #

In [None]:
# Gather one vector per node across all molecules.
# For each molecule the node part of the embedding (element 0) has one column
# per node; its first row is skipped and the rest is transposed and split so
# that every node contributes its own piece.
universal_nodes = []
for embed in embeds:
    node_part = embed[0]
    n_pieces = node_part[0, :].shape[0]
    per_node = np.split(node_part[1:, :].T, n_pieces)
    universal_nodes.extend(per_node)
# Drop the singleton axis left by the split.
universal_nodes = np.squeeze(np.array(universal_nodes))

In [None]:
# Per-atom feature vector: the node embedding followed by the atom's bond
# one-hot entry. Both arrays are concatenated column-wise in one vectorised
# call instead of one np.hstack per atom.
# Only the first universal_nodes.shape[0] rows of freq_type_bonds are used,
# which is the same set of rows the per-atom loop visited.
n_atoms = universal_nodes.shape[0]
uni_frq_nodes = np.hstack([universal_nodes, freq_type_bonds[:n_atoms]])

In [None]:
# Per-molecule lookup used later to pick each molecule's rows out of the
# per-atom arrays (mol_to_atom[i] is iterated to index soft_encoded_node).
# NOTE(review): the second argument presumably selects nodes (0) vs edges (1)
# - confirm in molecules.graph_to_points.
mol_to_atom = graph_to_points(g_mol, 0)

In [None]:
# Cluster the per-atom feature vectors into 10 groups.
# No random_state is given, so the clustering can differ between runs.
kmeans_n = KMeans(n_clusters=10)
# fit_transform fits the model and returns every row's distance to each of
# the cluster centres (not the cluster labels).
universal_class_nodes = kmeans_n.fit_transform(uni_frq_nodes)
 
# Coordinates of the fitted cluster centres, one row per cluster.
centroids_n = kmeans_n.cluster_centers_

# Universal Edge Embedding #

In [None]:
# Gather one vector per edge across all molecules.
# For each molecule the edge part of the embedding (element 1) has one column
# per edge; its first row is skipped and the rest is transposed and split so
# that every edge contributes its own piece.
universal_edges = []
for embed in embeds:
    edge_part = embed[1]
    n_pieces = edge_part[0, :].shape[0]
    per_edge = np.split(edge_part[1:, :].T, n_pieces)
    universal_edges.extend(per_edge)
# Drop the singleton axis left by the split.
universal_edges = np.squeeze(np.array(universal_edges))

In [None]:
# Disabled cell: the code is wrapped in a string literal, so it never runs.
# It sketches a 4-slot one-hot encoding of every edge's 'bond_type' (value 12
# goes to slot 3, any other value b to slot b-1), stored as the
# 'bonds_one_hot' edge attribute.
"""for i, g in enumerate(g_mol):
    d_e = dict()
    for e in g.edges():
        b = int(g.get_edge_data(e[0], e[1])['bond_type'])
        d_e[e] = np.zeros(4)
        if (b == 12):
            d_e[e][3] = 1
        else:
            d_e[e][b-1] = 1
    nx.set_edge_attributes(g, name='bonds_one_hot', values=d_e)
    """

In [None]:
# Disabled cell (string literal, never executed): would collect the
# 'bonds_one_hot' edge attribute of every graph into one flat list.
"""
freq_type_bonds_edge = list()
for g in g_mol:
    freq_type_bonds_edge.extend(list(nx.get_edge_attributes(g, 'bonds_one_hot').values()))
    """

In [None]:
# Disabled cell (string literal, never executed): would append the edge bond
# one-hot to each edge embedding.
# NOTE(review): it refers to `universal_edge`, while the array built in this
# notebook is named `universal_edges`; fix the name before re-enabling.
"""
uni_frq_edge = list()
for x in range(universal_edge.shape[0]):
    uni_frq_edge.append(np.hstack([universal_edge[x,:], freq_type_bonds_edge[x,:]]))
"""

In [None]:
# Cluster the per-edge embedding vectors into 10 groups.
# No random_state is given, so the clustering can differ between runs.
kmeans_e = KMeans(n_clusters=10)
# fit_transform fits the model and returns every row's distance to each of
# the cluster centres (not the cluster labels).
universal_class_edge = kmeans_e.fit_transform(universal_edges)

# Coordinates of the fitted cluster centres, one row per cluster.
centroids_e = kmeans_e.cluster_centers_

In [None]:
# Per-molecule lookup used later to pick each molecule's rows out of the
# per-edge arrays (mol_to_bonds[i] is iterated to index soft_encoded_edge).
# NOTE(review): the second argument presumably selects edges (1) vs nodes (0)
# - confirm in molecules.graph_to_points.
mol_to_bonds = graph_to_points(g_mol, 1)

In [None]:
#Molecule sizes frequency

# Node count of every molecule graph.
dims = []
for graph in g_mol:
    dims.append(graph.number_of_nodes())

# Assigning to _ keeps the notebook from echoing the return values.
_ = plt.hist(dims, bins=100)
_ = plt.title("Molecule sizes frequency")

# Data with universal embedding #


### Nodes ###

In [None]:
# Hard (one-hot) cluster assignment of every atom.
# kmeans_n.labels_ holds the cluster index of each row the model was fitted
# on, i.e. one label per atom. The output of fit_transform cannot be used
# here because it contains distances to the centres, not labels.
n_atoms = universal_nodes.shape[0]
one_hot_encoded = np.zeros((n_atoms, kmeans_n.n_clusters))
one_hot_encoded[np.arange(n_atoms), kmeans_n.labels_] = 1

In [None]:
#Soft Encoded
# Soft assignment of every atom to every node centroid:
#     weight[x, c] = exp(-||uni_frq_nodes[x] - centroids_n[c]||^2 / 2)
# One vectorised pass per centroid replaces the per-(atom, centroid) Python
# loop, and the number of columns follows centroids_n instead of a literal 10.
soft_encoded_node = np.stack(
    [
        np.exp(-np.sum((uni_frq_nodes - centroid) ** 2, axis=1) / 2)
        for centroid in centroids_n
    ],
    axis=1,
)

In [None]:
#Normalize
# Rescale each atom's soft assignments so they sum to 1. Iterating the array
# yields row views, so the division edits soft_encoded_node in place.
for row in soft_encoded_node:
    row /= row.sum()

In [None]:
#Create data for each graph
# Graph-level node features: for every molecule, sum the soft cluster
# assignments of the atoms listed for it in mol_to_atom.
x_data_node = np.array([
    np.sum([soft_encoded_node[n] for n in mol_to_atom[i]], axis=0)
    for i in range(len(g_mol))
])
x_data_node.shape

### Edges ###

In [None]:
# Disabled cell (string literal, never executed): would one-hot encode the
# cluster of every edge.
# NOTE(review): it refers to `universal_edge` (the array in this notebook is
# `universal_edges`) and indexes with `universal_class_edge`, which holds the
# output of fit_transform rather than cluster labels.
"""
one_hot_encoded_edge = np.zeros((universal_class_edge.shape[0], 10))
for x in range(len(universal_edge)):
    one_hot_encoded_edge[x][universal_class_edge[x]] = 1
    """

In [None]:
#Soft Encoded
# Soft assignment of every edge to every edge centroid:
#     weight[x, c] = exp(-||universal_edges[x] - centroids_e[c]||^2 / 2)
# One vectorised pass per centroid replaces the per-(edge, centroid) Python
# loop, and the number of columns follows centroids_e instead of a literal 10.
soft_encoded_edge = np.stack(
    [
        np.exp(-np.sum((universal_edges - centroid) ** 2, axis=1) / 2)
        for centroid in centroids_e
    ],
    axis=1,
)

In [None]:
# Rescale each edge's soft assignments so they sum to 1. Iterating the array
# yields row views, so the division edits soft_encoded_edge in place.
for row in soft_encoded_edge:
    row /= row.sum()

In [None]:
#Create edge data for each graph
# Graph-level edge features: for every molecule, sum the soft cluster
# assignments of the edges listed for it in mol_to_bonds.
x_data_edge = np.array([
    np.sum([soft_encoded_edge[n] for n in mol_to_bonds[i]], axis=0)
    for i in range(len(g_mol))
])
x_data_edge.shape

In [None]:
# One feature row per molecule: node features followed by edge features.
x_data = np.concatenate((x_data_node, x_data_edge), axis=1)

In [None]:
# Centre every feature column on zero, then divide it by its range (max - min).
# Both steps modify x_data in place and use statistics of the whole dataset.
x_data -= x_data.mean(axis=0)
column_range = x_data.max(axis=0) - x_data.min(axis=0)
x_data /= column_range

# Classification Model #

In [None]:
#Prepare y_data
# The labels must skip exactly the rows that are skipped when g_mol is built
# (559 and 8097). Skipping a different row shifts every label between the two
# indices by one position relative to its feature row in x_data.
skipped_rows = (559, 8097)
y_data = np.array(
    [df['HIV_active'][i] for i in range(df.shape[0]) if i not in skipped_rows]
)
y_data.shape

In [None]:
import random

In [None]:
# Deterministic permutation of the sample indices (fixed seed 10).
# NOTE(review): 41911 is hard-coded; presumably the number of rows in x_data
# - confirm.
shuffler = random.Random(10)
f = np.arange(41911)
shuffler.shuffle(f)

# The first 35000 shuffled indices form the train/validation pool.
train = f[:35000]

In [None]:
#f[35000:35010]

In [None]:


# Reshuffle the train/validation pool. `train` is a slice of `f`, so this
# reorders f[:35000] in place and leaves the test indices f[35000:] untouched.
# NOTE: NumPy's global RNG is not seeded here, so the train/validation
# partition differs between runs (the test set does not).
np.random.shuffle(train)

# Index sets: 29000 training, 6000 validation, the remainder for testing.
i_train = train[:29000]
i_val = train[29000:35000]
i_test = f[35000:]

# Select rows with index arrays; the test split uses the same vectorised
# indexing as the other two instead of a per-row Python loop.
x_train = x_data[i_train, :]
x_val = x_data[i_val, :]
x_test = x_data[i_test, :]

y_train = y_data[i_train]
y_val = y_data[i_val]
y_test = y_data[i_test]

In [None]:
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras import optimizers

from sklearn.metrics import roc_auc_score

In [None]:
# Keras model: two ReLU hidden layers (32 and 64 units), each followed by 50%
# dropout, and a single sigmoid output unit.
model = Sequential([
    Dense(32, activation='relu'),
    Dropout(rate=0.5),
    Dense(64, activation='relu'),
    Dropout(rate=0.5),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam optimiser with a step size of 0.001.
# NOTE(review): `lr` is the older Keras spelling of this argument; newer
# releases call it `learning_rate` - confirm against the installed version.
adam = optimizers.Adam(lr=0.001)
# Binary cross-entropy loss, with accuracy tracked as an extra metric.
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [None]:

# Train for 100 epochs in mini-batches of 128, evaluating on the validation
# split after every epoch.
model.fit(x_train, y_train, epochs=100,validation_data=(x_val, y_val), batch_size=128)

In [None]:

# evaluate the keras model

# Evaluated on the validation split. With metrics=['accuracy'] set at compile
# time, evaluate returns the loss (stored in pe) followed by the accuracy.
pe, accuracy = model.evaluate(x_val, y_val)
# Accuracy is reported as a percentage.
print('Accuracy: %.2f' % (accuracy*100))



In [None]:
# Predicted scores for the three splits.
p_train = model.predict(x_train)
p_val = model.predict(x_val)
p_test = model.predict(x_test)

# Report the ROC AUC of each split, in train / validation / test order.
auc_report = (
    (" Train AUC-ROC :", y_train, p_train),
    (" Valid AUC-ROC :", y_val, p_val),
    (" Test AUC-ROC :", y_test, p_test),
)
for header, y_true, y_score in auc_report:
    print(header)
    print(roc_auc_score(y_true, y_score))