# Notebook for preparing and saving MOLECULAR graphs

In [None]:
import numpy as np
import torch
import pickle
import time
import os
%matplotlib inline
import matplotlib.pyplot as plt


# Download ZINC dataset

In [None]:
if not os.path.isfile('molecules.zip'):
    print('downloading..')
    !curl https://www.dropbox.com/s/feo9qle74kg48gy/molecules.zip?dl=1 -o molecules.zip -J -L -k
    !unzip molecules.zip -d ../
    # !tar -xvf molecules.zip -C ../
else:
    print('File already downloaded')
    

# Convert to DGL format and save with pickle

In [None]:
import os
# Move two directory levels up so the relative paths used in later cells
# (e.g. 'data/molecules/ZINC.pkl') resolve from there.
# NOTE: the path is relative, so re-running this cell moves up two more levels.
os.chdir('../../') # go to root folder of the project
print(os.getcwd())


In [None]:
import pickle

%load_ext autoreload
%autoreload 2

from data.molecules import MoleculeDatasetDGL 

from data.data import LoadData
from torch.utils.data import DataLoader
from data.molecules import MoleculeDataset


In [None]:
# Name handed to the dataset constructor.
DATASET_NAME = 'ZINC'
# Build the dataset object; later cells read its .train / .val / .test splits.
# presumably this parses the raw files unpacked by the download cell — confirm in data/molecules.py
dataset = MoleculeDatasetDGL(DATASET_NAME) 


In [None]:
def plot_histo_graphs(dataset, title):
    """Show a 20-bin histogram of per-sample node counts and print their range.

    Args:
        dataset: iterable of samples; element 0 of each sample must provide
            ``number_of_nodes()`` (presumably a DGL graph — confirm).
        title: text used as the plot title.

    Returns:
        None. The figure is shown and the min/max node counts are printed.
    """
    # One node count per sample, taken from the first element of the sample.
    node_counts = [sample[0].number_of_nodes() for sample in dataset]

    # Draw into figure number 1.
    plt.figure(1)
    plt.hist(node_counts, bins=20)
    plt.title(title)
    plt.show()

    # Convert to a tensor to compute the extremes, then cast back to
    # integers for printing.
    counts = torch.Tensor(node_counts)
    smallest = counts.min().long().item()
    largest = counts.max().long().item()
    print('min/max :', smallest, largest)
    
# Plot the graph-size histogram for each split, in train / val / test order.
for split_attr, split_title in (
    ('train', 'trainset'),
    ('val', 'valset'),
    ('test', 'testset'),
):
    plot_histo_graphs(getattr(dataset, split_attr), split_title)


In [None]:
# Number of samples in each split, in train / val / test order.
for split_attr in ('train', 'val', 'test'):
    print(len(getattr(dataset, split_attr)))

# First sample of each split, in the same order.
for split_attr in ('train', 'val', 'test'):
    print(getattr(dataset, split_attr)[0])


In [None]:
# Hard-coded for ZINC; both values are written into the pickle next to the
# three splits by the save cell below.
# presumably the number of distinct atom / bond categories — TODO confirm the values
num_atom_type = 28
num_bond_type = 4


In [None]:
# Write the three splits and the two type counts into a single pickle file
# and report how long the write took.
start = time.time()
with open('data/molecules/ZINC.pkl', 'wb') as f:
    pickle.dump(
        [dataset.train, dataset.val, dataset.test, num_atom_type, num_bond_type],
        f,
    )
print('Time (sec):', time.time() - start)


# Test load function

In [None]:
# Rebind `dataset` to the object returned by LoadData, replacing the
# MoleculeDatasetDGL instance built earlier in the notebook.
# presumably LoadData reads the ZINC.pkl file written above — confirm in data/data.py
DATASET_NAME = 'ZINC'
dataset = LoadData(DATASET_NAME)
# Keep direct handles on the three splits.
trainset, valset, testset = dataset.train, dataset.val, dataset.test


In [None]:
batch_size = 10
# Collate function taken from the MoleculeDataset class; handed to the DataLoader as collate_fn.
collate = MoleculeDataset.collate
print(MoleculeDataset)
# Shuffled mini-batches over the training split; the loader is only constructed here, not iterated.
train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate)


In [None]:
os.chdir('./data/molecules/') # enter data/molecules (relative to the current directory) before the AqSol download
print(os.getcwd())

# Download AqSol dataset

In [None]:
if not os.path.isfile('aqsol_graph_raw.zip'):
    print('downloading..')
    !curl https://www.dropbox.com/s/lzu9lmukwov12kt/aqsol_graph_raw.zip?dl=1 -o aqsol_graph_raw.zip -J -L -k
    !unzip aqsol_graph_raw.zip -d ./
else:
    print('File already downloaded')

# Convert to DGL format and save with pickle

In [None]:
# Undo the earlier chdir into ./data/molecules/ by moving two levels up.
# NOTE: the path is relative, so re-running this cell moves up two more levels.
os.chdir('../../') # go to root folder of the project
print(os.getcwd())

In [None]:
# Name handed to the dataset constructor. Note the mixed-case spelling here,
# versus 'AQSOL' used for the pickle file name and the LoadData call below.
DATASET_NAME = 'AqSol'
# Rebinds `dataset`; later cells read its .train / .val / .test splits and
# its num_atom_type / num_bond_type attributes.
dataset = MoleculeDatasetDGL(DATASET_NAME)

In [None]:
# Plot the graph-size histogram for each split, in train / val / test order.
for split_attr, split_title in (
    ('train', 'trainset'),
    ('val', 'valset'),
    ('test', 'testset'),
):
    plot_histo_graphs(getattr(dataset, split_attr), split_title)

In [None]:
# Number of samples in each split, in train / val / test order.
for split_attr in ('train', 'val', 'test'):
    print(len(getattr(dataset, split_attr)))

# First sample of each split, in the same order.
for split_attr in ('train', 'val', 'test'):
    print(getattr(dataset, split_attr)[0])

In [None]:
# Unlike the ZINC section (hard-coded 28 / 4), the counts are read from the
# dataset object; both are written into the pickle by the save cell below.
num_atom_type = dataset.num_atom_type
num_bond_type = dataset.num_bond_type

In [None]:
# Write the three splits and the two type counts into a single pickle file
# and report how long the write took.
start = time.time()
with open('data/molecules/AQSOL.pkl', 'wb') as f:
    pickle.dump(
        [dataset.train, dataset.val, dataset.test, num_atom_type, num_bond_type],
        f,
    )
print('Time (sec):', time.time() - start)

# Test load function

In [None]:
# Rebind `dataset` to the object returned by LoadData, replacing the
# MoleculeDatasetDGL instance built above.
# presumably LoadData reads the AQSOL.pkl file written above — confirm in data/data.py
DATASET_NAME = 'AQSOL'
dataset = LoadData(DATASET_NAME)
# Keep direct handles on the three splits.
trainset, valset, testset = dataset.train, dataset.val, dataset.test

In [None]:
batch_size = 10
# Collate function taken from the MoleculeDataset class; handed to the DataLoader as collate_fn.
collate = MoleculeDataset.collate
print(MoleculeDataset)
# Shuffled mini-batches over the training split; the loader is only constructed here, not iterated.
train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate)