In [3]:
#!/usr/bin/env python3
import sys
from tqdm.autonotebook import tqdm
# Maths things
import numpy as np

# Atomistic structure manipulation
from ase.io import read, write
from rascal import *

sys.path.append('../')
# from rascal import representations
# Librascal
# import rascal.representations
from rascal.representations import SphericalInvariants as SOAP

# Local Utilities for Notebook
from utilities.general import FPS, center_matrix, normalize_matrix, load_variables
from utilities.kernels import linear_kernel, gaussian_kernel, center_kernel

%aiida
from aiida import orm, load_profile
import matplotlib.pyplot as plt
from aiida.orm import QueryBuilder
from collections import Counter
import pandas as pd
from pandas.plotting import scatter_matrix
import seaborn as sns
import ase
from math import pi
import numpy as np
from ase.atoms import Atoms
from ase.calculators import calculator
from ase.calculators.calculator import Calculator, kpts2ndarray
from ase import Atoms
from operator import itemgetter 

ModuleNotFoundError: No module named 'utilities'

In [14]:
# Second dataset: per-atom chemical shifts stored in the CSD xyz file

N = 10
input_file = "./CSD.xyz"
properties = ["CS_local", "CS_total"]

# Load only the first N frames of the file
frames = read(input_file, index=':{}'.format(N))

# Call wrap() on every frame before reading its per-atom arrays
for frame in frames:
    frame.wrap()

# For each property, join the per-atom arrays of all frames end to end;
# stacking and transposing then gives one column per property.
stacked = [
    np.concatenate([atoms.arrays[key] for atoms in frames])
    for key in properties
]
Y = np.vstack(stacked).T

print("Within the {} frames we have {} environments.".format(N, len(Y)))
print(Y)

Within the 10 frames we have 1362 environments.
[[87.69 93.91]
 [84.89 91.12]
 [87.57 93.8 ]
 ...
 [22.49 28.23]
 [23.37 29.12]
 [23.35 29.1 ]]


In [23]:
# Load the dataset: query the StructureData nodes, then subsample the result.
skip=100  # keep every `skip`-th result; the complete set is usually 79855 entries
load_profile()
qb = QueryBuilder()
qb.append(orm.StructureData)
# NOTE(review): this count is neither stored nor the last expression of the
# cell, so it is not displayed anywhere.
qb.count()
# Fetch every result first, then slice with a stride of `skip` (reduced dataset).
# Each entry is later unpacked as a one-element list (`for [structure_data] in qb_red`).
qb_red=qb.all()[::skip]

In [24]:
# some useful functions
def removeElements(lst):
    """Drop the rare items of ``lst``.

    An item is kept when the number of times it occurs in ``lst`` is at
    least ``len(lst) / 1000``; order and duplicates of the kept items are
    preserved.

    Returns a new list; ``lst`` itself is not modified.
    """
    occurrences = Counter(lst)
    return list(
        filter(lambda item: occurrences[item] >= len(lst) / 1000, lst)
    )
# (last space-group number of the range, point-group label, number of
# symmetry operations).  Ranges are contiguous, so the first row whose upper
# bound is >= the space-group number is the match.
# Label convention used throughout this notebook: "_" after a symbol stands
# for an overbar on that symbol (e.g. "4_" is 4-bar, "3_m" is 3-bar m).
# 'm_3' (space groups 200-206) is kept exactly as it was originally spelled.
_POINT_GROUP_RANGES = (
    (1, '1', 1),
    (2, '1_', 2),
    (5, '2', 2),
    (9, 'm', 2),
    (15, '2/m', 4),
    (24, '222', 4),
    (46, 'mm2', 4),
    (74, 'mmm', 8),
    (80, '4', 4),
    (82, '4_', 4),
    (88, '4/m', 8),
    (98, '422', 8),
    (110, '4mm', 8),
    (122, '4_2m', 8),
    (142, '4/mmm', 16),
    (146, '3', 3),
    (148, '3_', 6),   # was mislabelled 'm3_' (m-3); 147-148 belong to 3-bar
    (155, '32', 6),
    (161, '3m', 6),
    (167, '3_m', 12),
    (173, '6', 6),
    (174, '6_', 6),
    (176, '6/m', 12),
    (182, '622', 12),
    (186, '6mm', 12),
    (190, '6_m2', 12),
    (194, '6/mmm', 24),
    (199, '23', 12),
    (206, 'm_3', 24),
    (214, '432', 24),
    (220, '4_3m', 24),
    (230, 'm3_m', 48),
)


def point_group(structure):
    """Map a space-group number to its point group.

    Parameters
    ----------
    structure : int or object with ``extras['spacegroup_number']``
        Either the space-group number itself (1-230), which is how the
        notebook calls this function, or an object exposing the number
        through ``structure.extras['spacegroup_number']``.

    Returns
    -------
    tuple
        ``(pg, o)`` where ``pg`` is the point-group label and ``o`` the
        number of symmetry operations of that point group.

    Raises
    ------
    ValueError
        If the space-group number is outside 1-230.

    Notes
    -----
    The label is also written to the module-level name ``pg`` because
    existing cells read it from there; new code should use the return value.
    """
    global pg
    import numbers

    # Use the argument itself instead of whatever `sg` happens to be bound
    # to at module level.
    if isinstance(structure, numbers.Integral):
        sg_number = int(structure)
    else:
        sg_number = structure.extras['spacegroup_number']

    if not 1 <= sg_number <= 230:
        raise ValueError(
            "space group number must be in 1..230, got {!r}".format(sg_number)
        )

    for upper, label, order in _POINT_GROUP_RANGES:
        if sg_number <= upper:
            pg = label
            return (label, order)

    # Not reachable for integers in 1..230; guards against odd numeric types.
    raise ValueError(
        "space group number must be in 1..230, got {!r}".format(sg_number)
    )

def magic_four(lista):
    """Return the positions in ``lista`` whose value is divisible by 4.

    Positions are reported in increasing order; an empty input gives an
    empty list.
    """
    idx = []
    for position in range(len(lista)):
        if lista[position] % 4 == 0:
            idx.append(position)
    return idx

In [25]:
# Per-structure statistics gathered over the reduced dataset `qb_red`.
n_atoms = []

# Number of atoms of a given element in each structure.
# NOTE(review): `N` (nitrogen counts) rebinds the name `N` that the CSD
# loading cell uses for the number of frames.
H = []
He = []
Li = []
C = []
N = []

# Placeholders for counts normalised by the number of atoms; nothing below
# fills them.
H_norm = []
He_norm = []
Li_norm = []
C_norm = []
N_norm = []

# 1 when the structure has a multiple of 4 atoms, 0 otherwise.
yes_or_no = []

# Counters: odd electron count ("metal") vs even electron count, overall and
# restricted to structures whose atom count is a multiple of 4.
general_metal = 0
general_even = 0
metal = 0
even = 0
number_structures_with_multiple_of_4 = 0

point_group_list = []

for [structure_data] in qb_red:
    sg = structure_data.extras['spacegroup_number']
    natom = len(structure_data.get_site_kindnames())
    atoms = structure_data.get_ase()
    number_electrons = sum(atoms.numbers)
    multiple_of_4 = natom % 4 == 0

    if number_electrons % 2 != 0:
        general_metal += 1
        if multiple_of_4:
            metal += 1
    else:
        general_even += 1
        if multiple_of_4:
            even += 1

    if multiple_of_4:
        number_structures_with_multiple_of_4 += 1
    yes_or_no.append(1 if multiple_of_4 else 0)

    # Take the label from the return value rather than from the global `pg`
    # that point_group() happens to set as a side effect.
    sp_group = point_group(sg)
    point_group_list.append(sp_group[0])

    n_atoms.append(natom)

    # Count exact element symbols. The previous substring search on the
    # printed list counted 'H' inside 'He'/'Hf'/'Hg'/'Ho', 'C' inside
    # 'Ca'/'Cl'/'Co'/'Cu', 'N' inside 'Na'/'Ni'/'Nb', and so on.
    element_counts = Counter(atoms.get_chemical_symbols())
    H.append(element_counts['H'])
    He.append(element_counts['He'])
    Li.append(element_counts['Li'])
    C.append(element_counts['C'])
    N.append(element_counts['N'])

print(n_atoms)


[24, 12, 24, 12, 14, 14, 22, 36, 104, 6, 64, 7, 44, 28, 5, 22, 22, 10, 44, 20, 60, 22, 6, 4, 64, 6, 12, 128, 132, 54, 10, 20, 8, 64, 4, 4, 18, 34, 68, 20, 5, 3, 52, 168, 12, 13, 106, 17, 240, 5, 5, 3, 130, 80, 20, 16, 52, 16, 7, 5, 32, 212, 38, 52, 8, 14, 8, 6, 6, 40, 12, 10, 14, 28, 104, 6, 24, 80, 64, 36, 14, 7, 7, 4, 68, 32, 5, 14, 30, 22, 6, 12, 68, 26, 9, 28, 112, 12, 10, 16, 22, 36, 92, 62, 28, 76, 36, 57, 4, 24, 36, 32, 12, 36, 68, 64, 12, 16, 46, 42, 22, 52, 22, 2, 32, 9, 18, 196, 38, 9, 20, 50, 10, 28, 12, 18, 116, 22, 2, 11, 24, 120, 39, 10, 18, 13, 32, 24, 2, 5, 4, 64, 44, 8, 64, 5, 44, 84, 72, 5, 12, 12, 32, 28, 112, 10, 24, 22, 15, 37, 28, 5, 448, 18, 6, 6, 88, 8, 16, 28, 46, 9, 24, 20, 12, 6, 48, 10, 4, 10, 80, 21, 38, 9, 29, 24, 204, 36, 68, 10, 36, 28, 38, 16, 138, 72, 68, 12, 64, 10, 10, 40, 22, 22, 48, 22, 48, 18, 14, 8, 36, 22, 13, 10, 4, 5, 122, 82, 3, 6, 38, 10, 10, 22, 4, 20, 24, 10, 54, 14, 72, 18, 52, 28, 56, 38, 4, 140, 22, 24, 18, 80, 1, 12, 56, 10, 2, 88, 18,

In [26]:
# Tabulate the per-structure atom counts and expose them as a plain array.
# NOTE(review): this rebinds `Y`, which the CSD loading cell had set to the
# chemical-shift matrix — confirm that is intended.
data = pd.DataFrame(
    {
        'number of atoms': n_atoms,
        'number of C': C,
        'number of N': N,
    },
    columns=['number of atoms', 'number of C', 'number of N'],
)
Y = data.to_numpy()

In [27]:
n_FPS = 200

# SOAP power-spectrum hyperparameters (from the librascal tutorial)
soap_hypers = dict(
    soap_type='PowerSpectrum',
    interaction_cutoff=3.5,
    max_radial=6,
    max_angular=6,
    gaussian_sigma_type='Constant',
    gaussian_sigma_constant=0.4,
    cutoff_smooth_width=0.5,
)
soap = SOAP(**soap_hypers)

# Featurise `frames`, then pull the feature matrix out of the representation.
# NOTE(review): `frames` are the CSD frames while `Y` has since been rebuilt
# from the AiiDA structures — confirm the rows of X_raw and Y correspond.
soap_rep = soap.transform(frames)
X_raw = soap_rep.get_features(soap)

# Second axis of X_raw is the feature axis
num_features = X_raw.shape[1]
print(soap)

<rascal.representations.spherical_invariants.SphericalInvariants object at 0x7f18e2e73990>


In [29]:
# Bundle the raw SOAP features and the targets into one .npz archive.
# `n_atoms` and `indices` are written as empty arrays here.
precomputed = dict(n_atoms=[], indices=[], X=X_raw, Y=Y)
np.savez("./precomputed.npz", **precomputed)

In [30]:
# Report the descriptor size before reducing it.
print(f"Each SOAP vector contains {num_features} components.\
       \nWe use furthest point sampling to generate a subsample of our SOAP vectors.")
# Furthest point sampling over the feature axis: X_raw is transposed before
# the call, and the returned indices are used below to pick columns of X_raw.
# NOTE(review): FPS is imported from utilities.general; the recorded run of
# this cell ended in a NameError because that import had failed.
# col_dist is presumably the distance reported by FPS for each pick — TODO confirm.
col_idxs, col_dist = FPS(X_raw.T, n_FPS)
# Keep only the n_FPS selected feature columns.
X = X_raw[:, col_idxs]



Each SOAP vector contains 2520 components.       
We use furthest point sampling to generate a subsample of our SOAP vectors.


NameError: name 'FPS' is not defined