In [1]:
%run ./modules.ipynb



Define paths to input .xyz data and output .npz SOAP vector files:

In [5]:
"""With example MP data"""

path_to_input='../example_data/example_data_MP.xyz'
path_to_output='../example_data/example_SOAP_MP.npz'

"""For a complete analysis, uncomment the following lines:"""
# path_to_input='../r4data/3dcd.xyz'
# path_to_output='../r4data/3DCD_SOAP_tot.npz'
# # OR
# path_to_input='../r4data/MP.xyz'
# path_to_output='../r4data/MP_SOAP_tot.npz'

frames_relax_tot=ase.io.read(path_to_input, index=':')
len(frames_relax_tot)

840

In [3]:
n_FPS=2000   # number of most fiverse features selected 
n_frames=len(frames_relax_tot)    # number of structures analysed, the whole dataset will take some time
n_PC=5    # number of principal components to include in the analysis (min=2)

In [4]:
magic=np.zeros((len(frames_relax_tot)))
for frame, i in zip(frames_relax_tot, range(len(frames_relax_tot))):
    frame.wrap(eps=1e-12)
    if len(frame)%4==0:
        magic[i]=1

In [None]:
species = list(set([int(n) for frame in frames_relax_tot for n in frame.numbers]))

hypers = dict(
    soap_type="PowerSpectrum",
    interaction_cutoff=3.5,
    max_radial=4,
    max_angular=4,
    gaussian_sigma_type="Constant",
    gaussian_sigma_constant=0.5,
    cutoff_smooth_width=0.5,
    global_species=species,
    expansion_by_species_method="user defined",
    normalize=False,
)
fps_soap = SOAP(
    **hypers,
)
idx_for_fps = np.random.randint(0, len(frames_relax_tot), n_FPS) 
frames_for_fps = [frames_relax_tot[i] for i in idx_for_fps]
X_temp = np.array(
    [
        np.mean(fps_soap.transform([frame]).get_features(fps_soap), axis=0)
        for frame in frames_for_fps
    ]
)

THRESH = 1e-12
high_var_features = np.where(np.var(X_temp - X_temp.mean(axis=0), axis=0) > THRESH)[0]
X_temp = X_temp[:, high_var_features]

X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(X_temp)
del X_temp # save on memory
fps = FPS(n_FPS).fit(X_raw)
del X_raw 
u_species = np.unique(species)
sp_pairs = fps_soap.get_keys(u_species)

coefficient_subselection = np.zeros((n_FPS, 5))
index_mapping = get_power_spectrum_index_mapping(
    sp_pairs, n_max=hypers["max_radial"], l_max=hypers["max_angular"] + 1
)
for fi, i in enumerate(high_var_features[fps.selected_idx_]):
    coefficient_subselection[fi] = [
        index_mapping[i][k] for k in ["a", "b", "n1", "n2", "l"]
    ]
coefficient_subselection_dict = {
    "a": coefficient_subselection[:, 0].tolist(),
    "b": coefficient_subselection[:, 1].tolist(),
    "n1": coefficient_subselection[:, 2].tolist(),
    "n2": coefficient_subselection[:, 3].tolist(),
    "l": coefficient_subselection[:, 4].tolist(),
}

soap = SOAP(**hypers, coefficient_subselection=coefficient_subselection_dict)
idx = np.random.randint(0, len(frames_relax_tot), n_frames) 
my_frames= [frames_relax_tot[i] for i in idx]

magic=magic.reshape(-1, 1)[idx]
               
X_raw = np.zeros((len(my_frames), n_FPS))
for fi, frame in enumerate(my_frames):
    X_raw[fi] = np.mean(soap.transform([frame]).get_features(soap), axis=0)

SAVING THE NPZ SOAP VECTOR FILE IN YOUR REPO SO THAT YOU DON'T HAVE TO RUN THE ABOVE EVERY TIME (14 MB FOR FPS=2000, n_PC=5, n_frames=840)

In [35]:
file=np.savez(path_to_output, idx=idx, my_frames=my_frames,
            magic=magic ,X_raw=X_raw, coeff=coefficient_subselection)

  return array(a, dtype, copy=False, order=order, subok=True)
