## Create 20 atomic models of a 2x2x2 MAPbI3 supercell, each containing two randomly placed water molecules, and select 10 of them by Farthest Point Sampling

In [None]:
from skmatter.sample_selection import FPS
import numpy as np
import pandas as pd
from numpy.linalg import norm
from scipy.spatial.distance import pdist, squareform

#from ase.io import read,write
from ase.visualize import view,ngl
from ase.spacegroup import crystal
#from ase.spacegroup import Spacegroup
from ase.data import atomic_numbers, atomic_names
from ase import Atoms
from ase import neighborlist
from itertools import product

import nglview as nv
from ase.build import molecule

from rascal.representations import SphericalInvariants
#from rascal.models import Kernel

### Add the missing functions to compute the SOAP descriptors; use the notebook TEST_SOAP as a reference (points: 2)

In [None]:
...

In [None]:
def view_structure(structure,myvec=None):
    """Build an nglview widget that displays ``structure``.

    The widget shows the unit cell, a ball-and-stick representation, black
    atom-index labels, and a blue 'spacefill' representation restricted to
    the selection given in ``myvec``.

    Parameters
    ----------
    structure
        Object handed to ``nv.ASEStructure`` (presumably an ASE ``Atoms``).
    myvec
        Selection forwarded to the 'spacefill' representation. ``None``
        (the default) is treated as an empty list.

    Returns
    -------
    The configured ``nv.NGLWidget``.
    """
    # Create a fresh list on every call instead of sharing one mutable
    # default object between calls.
    selection = [] if myvec is None else myvec

    t = nv.ASEStructure(structure)
    w = nv.NGLWidget(t, gui=True)
    w.add_unitcell()
    w.add_ball_and_stick()
    w.add_representation('label',label_type='atomindex',color='black')
    w.add_representation('spacefill',selection=selection,color="blue",radius=0.5)
    return w

In [None]:
# Sites of the MAPbI3 cell as (element, fractional coordinates) pairs, kept
# in the order in which they are passed to ase.spacegroup.crystal.
_mapbi3_sites = [
    ('Pb', (0.5, 0, 0)),
    ('I', (0.4842, 0.25, -0.0562)),
    ('I', (0.1886, 0.0147, 0.1844)),
    ('N', (0.9421, 0.75, 0.0297)),
    ('C', (0.9372, 0.25, 0.0575)),
    ('H', (0.9372, 0.25, 0.1874)),
    ('H', (0.8661, 0.1701, 0.0290)),
    ('H', (0.1275, 0.1891, -0.0085)),
    ('H', (0.9543, 0.75, 0.1459)),
]

# Build the unit cell from the sites above, space group number 62 and the
# given cell parameters.
MAPbI3 = crystal(
    symbols=[element for element, _frac in _mapbi3_sites],
    basis=[frac for _element, frac in _mapbi3_sites],
    spacegroup=62,
    cellpar=[8.86574, 12.6293, 8.57689, 90, 90, 90],
)

In [None]:
# Create the 2x2x2 supercell from the unit cell. No water has been added yet;
# this structure is the starting point that each sample is copied from.
supercell_no_h2o=MAPbI3.repeat((2,2,2))

In [None]:
# Set up an ASE neighbour list for the water-free supercell, using the
# per-atom cutoffs returned by neighborlist.natural_cutoffs.
cutOff = neighborlist.natural_cutoffs(supercell_no_h2o)
nl = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True)
nl.update(supercell_no_h2o)

# Locate every N atom, then gather the H atoms found among each N's
# neighbours (the NH3 groups of the crystal).
symbols = supercell_no_h2o.get_chemical_symbols()
all_N = [position for position, symbol in enumerate(symbols) if symbol == 'N']

all_H_of_N = []
for n_index in all_N:
    bonded = nl.get_neighbors(n_index)[0]
    all_H_of_N.extend(neighbour for neighbour in bonded if symbols[neighbour] == 'H')

all_nh3 = all_N + all_H_of_N

In [None]:
### Loop to generate nsamples=20 structures with random positioning of NH2O=2 water molecules

In [None]:
NH2O=2
# dmin and dmax bound the distance between a randomly chosen N atom and the
# oxygen of a trial H2O molecule. dmin is also the minimum distance allowed
# between any atom of the trial H2O and any atom already in the cell.
nsamples=20
dmin = 1.5
dmax = 2.5
orig_h2o=molecule('H2O')
# translate the molecule so that its first atom (the oxygen) is at (0,0,0)
orig_h2o.translate(-1*orig_h2o.positions[0])
samples=[]
ns=0
while ns < nsamples:
    nh2o = 0
    print("Creating sample ",ns)
    supercell = supercell_no_h2o.copy()
    # Rejection sampling: keep proposing water molecules until NH2O of them
    # have been accepted into this sample.
    while nh2o < NH2O:
        # Target position of the oxygen: a random direction scaled to a
        # length in [dmin, dmax) and anchored on a randomly chosen N atom.
        t_vector = np.random.uniform(low=-1,high=1,size=(3))
        t_vector = t_vector/np.linalg.norm(t_vector)
        t_vector *= np.random.uniform(low=dmin,high=dmax)
        a_random_N = all_N[np.random.randint(low=0,high=len(all_N))]
        t_vector += supercell.positions[a_random_N]

        # Random orientation: rotate about a random axis through the origin,
        # where the oxygen sits. The axis must be rot_axis; using the target
        # position t_vector as the axis would tie the orientation to where
        # the molecule is placed.
        rot_axis = np.random.uniform(low=-1,high=1,size=(3))
        rot_axis = rot_axis/np.linalg.norm(rot_axis)
        trial_h2o = orig_h2o.copy()
        trial_h2o.rotate(np.random.uniform(low=0,high=180),rot_axis,center=(0,0,0))

        # Move the rotated molecule to its target position and append it to
        # a candidate cell; the water atoms are the last atoms of candidate.
        trial_h2o.translate(t_vector)
        n_host = len(supercell)
        candidate = supercell + trial_h2o

        # Reject the trial if any of its atoms is closer than dmin
        # (minimum-image convention) to any atom that was already present,
        # including previously accepted water molecules.
        host_indices = list(range(n_host))
        too_close = any(
            (candidate.get_distances(i, host_indices, mic=True) < dmin).any()
            for i in range(n_host, len(candidate))
        )
        if not too_close:
            supercell = candidate
            nh2o+=1
    ns+=1
    samples.append(supercell)

### Explain in a few words what we are doing above. This does not look like a Monte Carlo procedure for adding H2O molecules: what would we need in order to implement MC steps? (points: 2)

In [None]:
# Make every sample periodic along all three cell vectors and fold its atoms
# back into the cell. pbc is set before wrap() because wrap() uses the
# structure's own pbc flags by default to decide which axes to wrap.
for structure in samples:
    structure.pbc=(1,1,1)
    structure.wrap()

In [None]:
# Hyper-parameters passed as keyword arguments to rascal's
# SphericalInvariants calculator.
# NOTE(review): global_species lists Z = 1, 6, 7, 53, 82 but not 8, although
# the samples built in this notebook contain H2O. Confirm this is intended.
hypers = dict(
    soap_type="PowerSpectrum",
    interaction_cutoff=5.0,
    max_radial=6,
    max_angular=6,
    gaussian_sigma_constant=0.4,
    gaussian_sigma_type="Constant",
    cutoff_smooth_width=0.5,
    radial_basis="GTO",
    cutoff_function_type="ShiftedCosine",
    cutoff_function_parameters=dict(width=0.5),
    global_species=[1, 6, 7, 53, 82],
)
soap = SphericalInvariants(**hypers)

In [None]:
# Run the SOAP calculator on the list of sample structures and keep the
# result in soap_rep.
soap_rep = soap.transform(samples)

### Function to perform FPS using a distance matrix

In [None]:
def FPS(Dmatrix, n=0, idx=None):
    """
        Does Farthest Point Selection using a precomputed distance matrix.
        Adapted from a routine by Michele Ceriotti

        NOTE: defining this function rebinds the name ``FPS`` that the
        notebook imports from ``skmatter.sample_selection``.

        Parameters
        ----------
        Dmatrix : array indexed as ``Dmatrix[i]`` -> distances from point i
            to every point (expected to be a square, non-negative numpy
            array).
        n : number of points to select; if ``n <= 0`` all points are
            requested.
        idx : index of the first selected point; drawn at random when None.

        Returns
        -------
        numpy array (dtype int32) with the indices of the selected points in
        selection order. It is shorter than ``n`` when the remaining points
        are all at zero distance from the ones already selected; in that
        case a message with the number of points obtained is printed.
    """
    N = Dmatrix.shape[0]

    # Nothing to select from
    if N == 0:
        return np.zeros(0, dtype=np.int32)

    # If desired number of points less than or equal to zero,
    # select all points
    if n <= 0:
        n = N

    # Indices of the selected points, in selection order
    fps_idxs = np.zeros(n, dtype=np.int32)

    if idx is None:
        # Pick first point at random
        idx = np.random.randint(0, N)
    fps_idxs[0] = idx

    # d1[j] is the distance from point j to the closest selected point
    d1 = Dmatrix[idx]

    # Loop over the remaining points...
    for i in range(1, n):

        # The candidate is the point farthest from everything selected so far
        farthest = np.argmax(d1)

        # Test BEFORE storing the candidate: a zero maximum means every
        # point coincides with an already selected one, so exactly i points
        # (slots 0..i-1) are valid.
        if d1[farthest] == 0.0:
            print("Only {} FPS Possible".format(i))
            return fps_idxs[:i]

        fps_idxs[i] = farthest

        # Update the distance to the closest selected point
        d1 = np.minimum(d1, Dmatrix[farthest])

    return fps_idxs

### Explain in a few words how we use the distance matrix to do FPS (points: 2)

In [None]:
# Exercise placeholders: each `...` below is to be replaced with real code.
# compute the vector of SOAP features
X=...
# group the SOAP vectors by molecule and compute the average kernel
avg_soap_samples=...
# compute the distance matrix (to be passed to FPS); the cell that calls FPS
# reads DistMat.values, so DistMat is expected to be a pandas object
DistMat=...

In [None]:
# DistMat.values returns the numpy array of the distance matrix stored in the
# pandas object. Select 10 points; idx=None lets FPS draw the first point at
# random.
FPS(DistMat.values,n=10,idx=None)

In [None]:
# Display DistMat: the bare expression makes it the output of this cell.
DistMat

### Check the list of 10 geometries you obtained from FPS and the matrix DistMat. Does the result make sense? (points: 4)