https://www.repository.cam.ac.uk/handle/1810/307452

The first time you run this notebook, uncomment and execute the following cell

In [None]:
#!wget https://www.repository.cam.ac.uk/bitstream/handle/1810/307452/Carbon_GAP_20.tgz
#!tar -xzvf Carbon_GAP_20.tgz

In [18]:
from ase.io import iread
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from scipy.spatial import distance_matrix
import numpy as np
import dgl
from ase.calculators.mopac import MOPAC
from ase import Atoms

In [2]:
class MyDataset(Dataset):
    '''
        In-memory dataset of atomic structures read with ase.io.iread.
        - self.xyz --> list with one array of atomic positions per structure
        - self.E --> list with the potential energy stored for each structure
        - item idx is the pair (self.xyz[idx], self.E[idx])
    '''
    def __init__(self, path='Carbon_GAP_20/Carbon_GAP_20_Training_Set.xyz'):
        '''
            path --> structure file to read; defaults to the Carbon_GAP_20 training set
            - the whole file is parsed eagerly, so construction time grows with file size
        '''
        self.xyz = []
        self.E = []
        for mol in iread(path):
            self.xyz.append(mol.get_positions())
            self.E.append(mol.get_potential_energy())

    def __len__(self):
        '''Number of structures that were read.'''
        return len(self.E)

    def __getitem__(self, idx):
        '''Return the (positions, potential energy) pair of structure idx.'''
        return self.xyz[idx], self.E[idx]

In [3]:
# Read every structure of the training-set file into memory (positions and energies).
dataset = MyDataset()

In [4]:
# No batch size, shuffling or collate function is given, so the DataLoader defaults apply.
dataloader = DataLoader(dataset)

Ideas:
<ul>
<li> Fully connected graph with distance
<li> Partially connected graph with nearest neighbors
<li> Enforce rotation and translation invariance before transformation into graph
</ul>

In [5]:
print(len(dataset)) # total number of structures in the dataset
print(len(dataset[0])) # each item is a (positions, energy) pair, hence 2
print(len(dataset[0][0])) # number of atoms (rows of the position array) in the first structure
print(dataset[0][1]) # potential energy of the first structure

6088
2
125
-767.80938844


In [115]:
def nearest_neighbors(g, m, k):
    '''
        g --> (3) coordinate of the reference point
        m --> (x,3) coordinates of every atom in the molecule
        k --> (1) how many nearest neighbors to return
        - g is expected to be one of the rows of m, so the closest hit (g itself) is dropped
        - k is reduced to len(m)-1 when the molecule has too few atoms
        - a molecule made of a single atom returns that atom as its own neighbor
        - returns (neighbor coordinates, neighbor distances, neighbor indices into m),
          ordered from closest to farthest
    '''
    n_atoms = len(m)
    k = n_atoms - 1 if k >= n_atoms else k

    # distances from the reference point to every atom, as a flat row
    row = distance_matrix([g], m)[0]

    if n_atoms == 1:
        # lone atom: keep the only entry instead of discarding it
        first, last = 0, 1
    else:
        # skip position 0, which is the reference point itself
        first, last = 1, k + 1

    # asking argpartition for every rank below `last` puts those ranks in sorted order
    indices = np.argpartition(row, range(last))[first:last]

    k_nearest = [m[i] for i in indices]
    k_dist = [row[i] for i in indices]
    return k_nearest, k_dist, indices

In [134]:
# Pick one structure to experiment with; a dataset item is a (positions, energy) tuple.
test = dataset[3]
test

(array([[5.710166  , 3.72909598, 3.32131804],
        [2.59837204, 5.030634  , 5.14362803],
        [3.67931201, 5.00652601, 5.97878404],
        [4.64755196, 5.78209798, 5.40881598],
        [5.72378796, 5.005056  , 5.11942199],
        [4.18371803, 3.73948396, 6.05522397],
        [5.43625597, 3.72625402, 5.51759601],
        [5.43449202, 2.95127001, 4.39863198],
        [2.44999998, 3.747324  , 4.70370603],
        [3.41804403, 2.94842796, 5.25417204],
        [4.17019398, 2.44999997, 4.23193401],
        [2.92843597, 5.00858398, 2.98782404],
        [2.64021803, 3.72850802, 3.36169404],
        [3.694796  , 2.9204    , 3.04760403],
        [4.94958802, 5.81051802, 3.23115799],
        [4.18381602, 5.00740799, 2.45000003],
        [4.63353803, 3.72066798, 2.48214399],
        [5.90401002, 5.00280199, 3.77643002],
        [2.90246602, 5.83188203, 4.0768    ],
        [4.16245198, 6.30581003, 4.23281601]]),
 -130.06626593)

In [118]:
# test is a (positions, energy) tuple: use atom 0 as the reference point and the
# position matrix test[0] as the molecule.
nearest_neighbors(test[0][0], test[0], 5)

([array([5.43449202, 2.95127001, 4.39863198]),
  array([4.63353803, 3.72066798, 2.48214399]),
  array([5.90401002, 5.00280199, 3.77643002]),
  array([4.18381602, 5.00740799, 2.45000003]),
  array([3.694796  , 2.9204    , 3.04760403])],
 [1.3570610554333233,
  1.365068460980519,
  1.36639284919799,
  2.1732512351098756,
  2.1887495937723074],
 array([ 7, 16, 17, 15, 13]))

In [119]:
# g1 = [0,1,0]
# g2 = [[0,4,0],[0,1,0],[0,6,0],[0,3,0],[0,2,0],[0,5,0]]
# nearest_neighbors(g1,g2,3)

In [120]:
def xyz_to_graph(molecule, k, node_featurizer, edge_featurizer):
    '''
        molecule --> (x,3) whole molecule geometry
        k --> (1) number of nearest neighbors each atom is connected to
        node_featurizer --> when True, every node gets an 'energy' feature
        edge_featurizer --> when True, every edge gets a 'length' feature
        - creates a DGL graph with one edge from each atom to each of its k nearest neighbors
        - 'energy' is the MOPAC (PM7 1SCF UHF) energy of a single isolated carbon atom,
          so it is the same value on every node
        - 'length' is the distance between the two atoms joined by the edge
        - returns the graph
    '''
    src = []
    dest = []
    ndist = []
    for atom in range(len(molecule)):
        _, dist, idx = nearest_neighbors(molecule[atom], molecule, k)
        # one outgoing edge per neighbor found for this atom
        src.extend([atom] * len(idx))
        dest.extend(idx)
        ndist.extend(dist)
    g = dgl.graph((torch.tensor(src), torch.tensor(dest)))

    if node_featurizer is True:
        # The reference calculation launches MOPAC, so it is only run when the
        # node feature is actually requested.
        c = Atoms('C', positions=[[0, 0, 0]])
        c.calc = MOPAC(label='C', task='PM7 1SCF UHF')
        energy = c.get_potential_energy()
        g.ndata.update({'energy': torch.tensor([energy] * len(molecule))})

    if edge_featurizer is True:
        g.edata.update({'length': torch.tensor(ndist)})

    return g

In [121]:
# test is a (positions, energy) tuple; the graph is built from the positions only.
g = xyz_to_graph(test[0], 3, node_featurizer=True, edge_featurizer=True)
g

Graph(num_nodes=20, num_edges=60,
      ndata_schemes={'energy': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'length': Scheme(shape=(), dtype=torch.float64)})

In [122]:
g.edata

{'length': tensor([1.3571, 1.3651, 1.3664, 1.3647, 1.3662, 1.3684, 1.3652, 1.3659, 1.3662,
        1.3586, 1.3652, 1.3757, 1.3550, 1.3586, 1.3699, 1.3615, 1.3631, 1.3659,
        1.3611, 1.3631, 1.3699, 1.3571, 1.3611, 1.3702, 1.3556, 1.3647, 1.3705,
        1.3615, 1.3635, 1.3705, 1.3601, 1.3635, 1.3702, 1.3643, 1.3654, 1.3657,
        1.3556, 1.3643, 1.3652, 1.3570, 1.3601, 1.3652, 1.3571, 1.3641, 1.3668,
        1.3571, 1.3634, 1.3657, 1.3570, 1.3634, 1.3651, 1.3550, 1.3641, 1.3664,
        1.3552, 1.3654, 1.3684, 1.3552, 1.3668, 1.3757], dtype=torch.float64)}

In [123]:
g.ndata

{'energy': tensor([-115.1573, -115.1573, -115.1573, -115.1573, -115.1573, -115.1573,
        -115.1573, -115.1573, -115.1573, -115.1573, -115.1573, -115.1573,
        -115.1573, -115.1573, -115.1573, -115.1573, -115.1573, -115.1573,
        -115.1573, -115.1573])}

In [18]:
# check with dummy example
# potential energy
# see what software the Carbon GAP paper uses to compute it

In [None]:
# create torch dataset like in class
# choose neural net (one round of mpnn and mlp predictor hw8 network) and train (use cs machine and wandb)
# read the paper more thoroughly: what they do and what they look at with this dataset

In [None]:
# dataset[4782][0] is a single point