This notebook is just a demonstration of how the SOAP vectors are obtained starting from the .xyz files. 
The soap.npz files have already been generated and can be downloaded, so there is no real need to run this notebook. 
If you decide to run it anyway using production data, be prepared to wait for a while, depending on how much RAM your machine has. The output files will be called 'generated_soap*.npz'. 

In [None]:
from data import DATA_3DCD, DATA_MP

In [None]:
# Execute the companion notebook inside this namespace.
# NOTE(review): names used below that are never imported in this notebook
# (np, ase, tqdm, SOAP, FPS, StandardFlexibleScaler,
# get_power_spectrum_index_mapping) are presumably defined there - confirm.
%run ./modules.ipynb

In [None]:
# Tell the user when both precomputed SOAP files are already on disk.
# This is informational only: nothing below is skipped automatically.
if all(dataset.soap.exists() for dataset in (DATA_3DCD, DATA_MP)):
    print("Data alreeady present, no need to execute this notebook.")

In [None]:
# Load every structure (index=":") of both datasets, 3DCD first, then MP.
frames_3dcd, frames_mp = (
    ase.io.read(dataset.structures, index=":") for dataset in (DATA_3DCD, DATA_MP)
)

# Displayed as the cell output: how many structures each dataset holds.
len(frames_3dcd), len(frames_mp)

In [None]:
# Number of most diverse features to keep. It is passed to FPS as
# `n_to_select`, and twice this many structures are drawn from each dataset
# to fit that selection.
n_FPS = 2_000

# Number of principal components to include in the analysis (minimum 2).
# Not referenced by the cells shown in this notebook.
n_PC = 5

In [None]:
# Distinct values found in the `numbers` array of any frame of either
# dataset (the atomic numbers of the ASE structures), as plain Python ints.
species = list(
    {
        int(atomic_number)
        for structure in (*frames_3dcd, *frames_mp)
        for atomic_number in structure.numbers
    }
)

# Keyword arguments shared by every SOAP calculator built in this notebook.
hypers = {
    "soap_type": "PowerSpectrum",
    "interaction_cutoff": 3.5,
    "max_radial": 4,
    "max_angular": 4,
    "gaussian_sigma_type": "Constant",
    "gaussian_sigma_constant": 0.5,
    "cutoff_smooth_width": 0.5,
    "global_species": species,
    "expansion_by_species_method": "user defined",
    "normalize": False,
}

# Displayed as the cell output: number of distinct species.
len(species)

In [None]:
# Calculator for the full (unselected) SOAP features that FPS is fitted on.
fps_soap = SOAP(**hypers)

# Draw 2 * n_FPS random frame indices per dataset. The draws are independent,
# so an index may appear more than once.
n_draws = 2 * n_FPS
idx_3dcd_for_fps = np.random.randint(low=0, high=len(frames_3dcd), size=n_draws)
idx_mp_for_fps = np.random.randint(low=0, high=len(frames_mp), size=n_draws)

In [None]:
# Gather the sampled structures: 3DCD picks first, MP picks after them.
# These are the same objects held in frames_3dcd / frames_mp, not copies.
frames_for_fps = [frames_3dcd[k] for k in idx_3dcd_for_fps]
frames_for_fps.extend(frames_mp[k] for k in idx_mp_for_fps)

# Flag each sampled structure as periodic and wrap it, in place.
for frame in frames_for_fps:
    frame.pbc = True
    frame.wrap(eps=1E-10)

In [None]:
# One row per sampled structure: its SOAP features averaged over axis 0 of
# the array returned by get_features.
per_structure_means = []
for frame in tqdm(frames_for_fps, desc="Compute SOAP features"):
    manager = fps_soap.transform([frame])
    per_structure_means.append(np.mean(manager.get_features(fps_soap), axis=0))

X_temp = np.array(per_structure_means)
del per_structure_means, manager  # keep only the stacked array around

In [None]:
# Checkpoint the unfiltered averaged SOAP features to disk. The array is
# passed positionally, so no explicit key name is given for it.
np.savez('x_temp.npz', X_temp)

In [None]:
# Store the randomly drawn frame indices of both datasets so the sample used
# for the feature selection can be identified later.
np.savez('idx.npz', idx_3dcd_for_fps=idx_3dcd_for_fps, idx_mp_for_fps=idx_mp_for_fps)

In [None]:
# Species keys of the SOAP calculator. They do not depend on the selection
# below and are used afterwards to map columns back to coefficients.
u_species = np.unique(species)
sp_pairs = fps_soap.get_keys(u_species)

# Keep only the feature columns whose variance across the sampled structures
# exceeds THRESH.
THRESH = 1e-12
feature_variance = np.var(X_temp - X_temp.mean(axis=0), axis=0)
high_var_features = np.flatnonzero(feature_variance > THRESH)
X_temp = X_temp[:, high_var_features]

# Standardise the surviving columns, then fit FPS to choose n_FPS of them.
# Each large intermediate is dropped as soon as it is no longer needed.
feature_scaler = StandardFlexibleScaler(column_wise=False)
X_raw = feature_scaler.fit_transform(X_temp)
del X_temp  # save on memory
feature_selector = FPS(n_to_select=n_FPS, progress_bar=True)
fps = feature_selector.fit(X_raw)
del X_raw

In [None]:
# Lookup from a power-spectrum column index to its coefficient labels.
index_mapping = get_power_spectrum_index_mapping(
    sp_pairs, n_max=hypers["max_radial"], l_max=hypers["max_angular"] + 1
)

# FPS indices refer to the variance-filtered matrix; translate them back to
# column indices of the full feature matrix.
selected_columns = high_var_features[fps.selected_idx_]

# One row per selected feature, one column per label, in this fixed order.
label_order = ["a", "b", "n1", "n2", "l"]
coefficient_subselection = np.zeros((n_FPS, 5))
for row, column in enumerate(selected_columns):
    labels = index_mapping[column]
    coefficient_subselection[row] = [labels[name] for name in label_order]

# The same table as a dict of plain lists, which is the form handed to SOAP.
coefficient_subselection_dict = {
    name: coefficient_subselection[:, position].tolist()
    for position, name in enumerate(label_order)
}

np.savez('../r4data/selected_features.npz', **coefficient_subselection_dict)

In [None]:
# SOAP calculator restricted to the features chosen by the FPS selection.
soap = SOAP(**hypers, coefficient_subselection=coefficient_subselection_dict)

# Until now only the 3DCD frames that happened to be drawn for the FPS sample
# were flagged periodic and wrapped. Treat every frame the same way, as is
# done for the MP frames, so that all features are computed on consistently
# prepared structures.
for frame in frames_3dcd:
    frame.pbc = True
    frame.wrap(eps=1E-10)

# One row per 3DCD structure: selected SOAP features averaged over axis 0 of
# the array returned by get_features.
X_raw = np.zeros((len(frames_3dcd), n_FPS))
for fi, frame in enumerate(tqdm(frames_3dcd, desc="Get SOAP features")):
    X_raw[fi] = np.mean(soap.transform([frame]).get_features(soap), axis=0)

In [None]:
from pathlib import Path

# Sanity check before writing: the configured 3DCD SOAP file must resolve to
# the expected location relative to the current working directory.
expected_soap_path = Path("../r4data/3DCD/soap.npz").resolve()
assert DATA_3DCD.soap.resolve() == expected_soap_path

In [None]:
# `magic` is only defined further down, and for the MP frames, so it cannot
# be used here: a top-to-bottom run would fail with NameError, and a re-run
# would store an array of the wrong length. Compute the flag for the 3DCD
# frames instead, with the same rule the MP cell applies: 1.0 when the number
# of atoms is a multiple of four, 0.0 otherwise.
magic_3dcd = np.array([len(frame) % 4 == 0 for frame in frames_3dcd], dtype=float)

# Write the 3DCD results: running index, the structures themselves, the
# multiple-of-four flag, the averaged SOAP features and the table of selected
# coefficients.
file = np.savez(
    DATA_3DCD.soap,
    idx=np.arange(len(frames_3dcd)),
    my_frames=frames_3dcd,
    magic=magic_3dcd,
    X_raw=X_raw,
    coeff=coefficient_subselection,
)

In [None]:
# Shell escape: delete the temporary feature checkpoint written earlier in
# this notebook. idx.npz is not removed here.
! rm x_temp.npz

In [None]:
# Flag every MP structure as periodic and wrap it, in place.
for structure in frames_mp:
    structure.pbc = True
    structure.wrap(eps=1E-10)

# One row per MP structure: selected SOAP features averaged over axis 0 of
# the array returned by get_features.
n_structures = len(frames_mp)
X_raw = np.zeros((n_structures, n_FPS))
for row, structure in enumerate(tqdm(frames_mp, desc="Get SOAP features")):
    features = soap.transform([structure]).get_features(soap)
    X_raw[row] = np.mean(features, axis=0)

In [None]:
# Flag array for the MP structures: 1 where the atom count is a multiple of
# four, 0 elsewhere. Each structure is also re-wrapped in place with a
# tighter eps on the way through.
magic = np.zeros(len(frames_mp))
for position, structure in enumerate(tqdm(frames_mp)):
    structure.wrap(eps=1e-12)
    if not len(structure) % 4:
        magic[position] = 1

In [None]:
# Arrays written to the MP SOAP file, keyed by the names they are stored
# under: running index, the structures themselves, the multiple-of-four
# flag, the averaged SOAP features and the table of selected coefficients.
mp_payload = {
    "idx": np.arange(len(frames_mp)),
    "my_frames": frames_mp,
    "magic": magic,
    "X_raw": X_raw,
    "coeff": coefficient_subselection,
}
file = np.savez(DATA_MP.soap, **mp_payload)