Dataset source (Carbon_GAP_20): https://www.repository.cam.ac.uk/handle/1810/307452

The first time you run this notebook, uncomment and execute the following cell to download and extract the dataset archive.

In [2]:
#!wget https://www.repository.cam.ac.uk/bitstream/handle/1810/307452/Carbon_GAP_20.tgz
#!tar -xzvf Carbon_GAP_20.tgz

In [40]:
from ase.io import iread
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from scipy.spatial import distance_matrix
import numpy as np

In [3]:
class MyDataset(Dataset):
    '''
    In-memory dataset of geometries and their potential energies.

    Every frame yielded by ase.io.iread for the given file is read once at
    construction time; item i is the pair (positions of frame i, energy of frame i).
    '''

    def __init__(self, path='Carbon_GAP_20/Carbon_GAP_20_Training_Set.xyz'):
        '''
            path --> file to read frames from; defaults to the training-set
                     path this notebook originally hard-coded
        '''
        self.xyz = []  # one get_positions() result per frame
        self.E = []    # one get_potential_energy() result per frame, same order as xyz
        for mol in iread(path):
            self.xyz.append(mol.get_positions())
            self.E.append(mol.get_potential_energy())

    def __len__(self):
        '''Number of frames that were read.'''
        return len(self.E)

    def __getitem__(self, idx):
        '''Return the (positions, energy) pair stored at position idx.'''
        return self.xyz[idx], self.E[idx]

In [4]:
# Build the dataset once; the constructor reads every frame of the training file into memory.
dataset = MyDataset()

In [5]:
# Wrap the dataset in a DataLoader; no batch size, shuffling or collate function is specified here.
# NOTE(review): entries may differ in number of atoms, which could prevent stacking them into
# larger batches without a custom collate function — confirm before raising the batch size.
dataloader = DataLoader(dataset)

Ideas:
<ul>
<li> Fully connected graph with distance
<li> Partially connected graph with nearest neighbors
<li> Enforce rotation and translation invariance before transformation into graph
</ul>

In [194]:
print(len(dataset)) # total number of entries read from the training file
print(len(dataset[0])) # each entry is a (positions, energy) pair, hence 2
print(len(dataset[0][0])) # number of rows (one per atom) in the first entry's positions
print(dataset[0][1]) # potential energy of the first entry

6088
2
125
-767.80938844


In [224]:
# returns both list of nearest neighbors and their distances
# by default g is assumed to be a row of m, so the closest hit (g itself) is excluded;
# pass exclude_self=False when g is not one of the points in m
def nearest_neighbors(g, m, k, exclude_self=True):
    '''
        g --> (3) one coordinate used as reference point
        m --> (x,3) whole molecule geometry
        k --> (1) number of nearest neighbors to be found
        exclude_self --> (bool) drop the single closest point, on the assumption
                         that it is g itself (default True)

        returns --> (k_nearest, k_dist): two lists of length k, ordered from
                    closest to farthest, holding the neighbor coordinates taken
                    from m and their distances to g. If m does not hold enough
                    points, an error is printed and None is returned instead
                    (kept as is for existing callers).
        raises --> ValueError if k is negative
    '''
    if k < 0:
        # A negative k would otherwise turn into a negative slice bound below
        # and silently return unrelated, unsorted points.
        raise ValueError("k must be non-negative, got "+str(k))

    skip = 1 if exclude_self else 0
    limit = len(m) + 1 - skip  # smallest k that cannot be satisfied
    if k >= limit:
        print("Error: there are not enough points for desired number of neighbors.")
        print("Choose a different number less than "+str(limit)+".")
        return None

    dist = distance_matrix([g], m)[0]
    if k == 0:
        return [], []

    # Giving argpartition a whole range of kth positions puts each of the first
    # k+skip entries in its sorted place, i.e. they come out in ascending order.
    nearest = np.argpartition(dist, range(k + skip))[skip:k + skip]
    k_nearest = [m[idx] for idx in nearest]
    k_dist = [dist[idx] for idx in nearest]
    return k_nearest, k_dist

In [225]:
# Positions of the dataset entry at index 5 (element [0] of an entry is its geometry).
test = dataset[5][0]
# Show the first row of that geometry; it is used below as the reference point.
test[0]

array([5.72412807, 0.58084565, 0.61387452])

In [226]:
# Five nearest neighbors of the first point within its own geometry, with their distances.
nearest_neighbors(test[0], test, 5)

([array([4.91289125, 0.58084565, 1.84162356]),
  array([6.69513208, 1.74253626, 0.61387452]),
  array([3.54733216, 0.58084565, 1.84162356]),
  array([7.09853279, 2.30838331, 1.84162356]),
  array([2.81099357, 0.58084565, 0.61387452])],
 [1.471554580484405,
  1.514058737565439,
  2.499161487181981,
  2.5260131445219116,
  2.9131345])